From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org> Subject: Linux: 2.6.32 Patch-mainline: 2.6.32 This patch contains the differences between 2.6.31 and 2.6.32. Acked-by: Jeff Mahoney <jeffm@suse.com> Automatically created from "patches.kernel.org/patch-2.6.32" by xen-port-patches.py --- sle11sp1-2010-03-29.orig/arch/x86/ia32/ia32entry-xen.S 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/ia32/ia32entry-xen.S 2009-11-06 14:53:39.000000000 +0100 @@ -20,18 +20,15 @@ #define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) #define __AUDIT_ARCH_LE 0x40000000 -#ifndef CONFIG_AUDITSYSCALL -#define sysexit_audit int_ret_from_sys_call -#define sysretl_audit int_ret_from_sys_call -#endif - #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) .macro IA32_ARG_FIXUP noebp=0 movl %edi,%r8d .if \noebp + jmp ia32_common .else movl %ebp,%r9d +ia32_common: .endif xchg %ecx,%esi movl %ebx,%edi @@ -39,12 +36,12 @@ .endm /* clobbers %eax */ - .macro CLEAR_RREGS _r9=rax + .macro CLEAR_RREGS offset=0, _r9=rax xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %\_r9,R9(%rsp) - movq %rax,R8(%rsp) + movq %rax,\offset+R11(%rsp) + movq %rax,\offset+R10(%rsp) + movq %\_r9,\offset+R9(%rsp) + movq %rax,\offset+R8(%rsp) .endm /* @@ -144,17 +141,7 @@ ENTRY(ia32_sysenter_target) jnz sysenter_tracesys cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys -sysenter_do_call: - IA32_ARG_FIXUP -sysenter_dispatch: - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) - GET_THREAD_INFO(%r10) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags(%r10) - jnz sysexit_audit - jmp int_ret_from_sys_call + jmp ia32_do_call #ifdef CONFIG_AUDITSYSCALL .macro auditsys_entry_common @@ -175,31 +162,10 @@ sysenter_dispatch: movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ .endm - .macro auditsys_exit exit,ebpsave=RBP - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) - jnz int_ret_from_sys_call - 
TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax,%esi /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ - movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit - movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */ - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - .endm - sysenter_auditsys: auditsys_entry_common movl %ebp,%r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch - -sysexit_audit: - auditsys_exit sysexit_from_sys_call + jmp ia32_dispatch #endif sysenter_tracesys: @@ -216,7 +182,7 @@ sysenter_tracesys: RESTORE_REST cmpl $(IA32_NR_syscalls-1),%eax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ - jmp sysenter_do_call + jmp ia32_do_call CFI_ENDPROC ENDPROC(ia32_sysenter_target) @@ -272,24 +238,13 @@ ENTRY(ia32_cstar_target) ja ia32_badsys cstar_do_call: IA32_ARG_FIXUP 1 -cstar_dispatch: - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) - GET_THREAD_INFO(%r10) - DISABLE_INTERRUPTS(CLBR_NONE) - testl $_TIF_ALLWORK_MASK,TI_flags(%r10) - jnz sysretl_audit - jmp int_ret_from_sys_call #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ auditsys_entry_common movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */ + jmp ia32_dispatch #endif cstar_tracesys: @@ -299,7 +254,7 @@ cstar_tracesys: #endif xchgl %r9d,%ebp SAVE_REST - CLEAR_RREGS r9 + CLEAR_RREGS 0, r9 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter @@ -367,9 +322,11 @@ ENTRY(ia32_syscall) ja ia32_badsys ia32_do_call: IA32_ARG_FIXUP +ia32_dispatch: call 
*ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: movq %rax,RAX-ARGOFFSET(%rsp) + CLEAR_RREGS -ARGOFFSET jmp int_ret_from_sys_call ia32_tracesys: @@ -387,8 +344,8 @@ END(ia32_syscall) ia32_badsys: movq $0,ORIG_RAX-ARGOFFSET(%rsp) - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) - jmp int_ret_from_sys_call + movq $-ENOSYS,%rax + jmp ia32_sysret quiet_ni_syscall: movq $-ENOSYS,%rax @@ -482,7 +439,7 @@ ia32_sys_call_table: .quad sys_mkdir .quad sys_rmdir /* 40 */ .quad sys_dup - .quad sys32_pipe + .quad sys_pipe .quad compat_sys_times .quad quiet_ni_syscall /* old prof syscall holder */ .quad sys_brk /* 45 */ @@ -776,5 +733,5 @@ ia32_sys_call_table: .quad compat_sys_preadv .quad compat_sys_pwritev .quad compat_sys_rt_tgsigqueueinfo /* 335 */ - .quad sys_perf_counter_open + .quad sys_perf_event_open ia32_syscall_end: --- sle11sp1-2010-03-29.orig/arch/x86/include/asm/time.h 2009-11-06 10:51:25.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/asm/time.h 2009-11-06 10:52:22.000000000 +0100 @@ -8,8 +8,9 @@ extern void hpet_time_init(void); extern void time_init(void); #ifdef CONFIG_XEN +struct timespec; extern int xen_independent_wallclock(void); -extern unsigned long xen_read_persistent_clock(void); +extern void xen_read_persistent_clock(struct timespec *); extern int xen_update_persistent_clock(void); #endif --- sle11sp1-2010-03-29.orig/arch/x86/include/asm/uv/uv_hub.h 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/arch/x86/include/asm/uv/uv_hub.h 2009-11-06 11:32:17.000000000 +0100 @@ -11,7 +11,7 @@ #ifndef _ASM_X86_UV_UV_HUB_H #define _ASM_X86_UV_UV_HUB_H -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_UV #include <linux/numa.h> #include <linux/percpu.h> #include <linux/timer.h> --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/agp.h 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/agp.h 2009-11-06 10:52:22.000000000 +0100 @@ -28,10 +28,7 @@ */ #define flush_agp_cache() wbinvd() -/* Convert a physical 
address to an address suitable for the GART. */ -#define phys_to_gart(x) phys_to_machine(x) -#define gart_to_phys(x) machine_to_phys(x) -#define page_to_gart(x) phys_to_gart(page_to_pseudophys(x)) +#define virt_to_gart virt_to_machine /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) ({ \ --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/desc.h 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/desc.h 2009-11-18 14:54:16.000000000 +0100 @@ -312,7 +312,14 @@ static inline void load_LDT(mm_context_t static inline unsigned long get_desc_base(const struct desc_struct *desc) { - return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); + return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); +} + +static inline void set_desc_base(struct desc_struct *desc, unsigned long base) +{ + desc->base0 = base & 0xffff; + desc->base1 = (base >> 16) & 0xff; + desc->base2 = (base >> 24) & 0xff; } static inline unsigned long get_desc_limit(const struct desc_struct *desc) @@ -320,6 +327,12 @@ static inline unsigned long get_desc_lim return desc->limit0 | (desc->limit << 16); } +static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) +{ + desc->limit0 = limit & 0xffff; + desc->limit = (limit >> 16) & 0xf; +} + #ifndef CONFIG_X86_NO_IDT static inline void _set_gate(int gate, unsigned type, void *addr, unsigned dpl, unsigned ist, unsigned seg) --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/dma-mapping.h 2009-11-06 10:51:47.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/dma-mapping.h 2009-11-06 10:52:22.000000000 +0100 @@ -1,11 +1,24 @@ #ifndef _ASM_X86_DMA_MAPPING_H_ +#define phys_to_dma _phys_to_dma_ +#define dma_to_phys _dma_to_phys_ + #include_next <asm/dma-mapping.h> -void dma_generic_free_coherent(struct device *, size_t, void *, dma_addr_t); +#undef phys_to_dma +#undef dma_to_phys + +static 
inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return phys_to_machine(paddr); +} -#define address_needs_mapping(hwdev, addr, size) \ - !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size) +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return machine_to_phys(daddr); +} + +void dma_generic_free_coherent(struct device *, size_t, void *, dma_addr_t); extern int range_straddles_page_boundary(paddr_t p, size_t size); --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/fixmap.h 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/fixmap.h 2009-11-06 10:52:22.000000000 +0100 @@ -139,6 +139,9 @@ enum fixed_addresses { #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif +#ifdef CONFIG_INTEL_TXT + FIX_TBOOT_BASE, +#endif __end_of_fixed_addresses }; --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2009-11-23 10:43:12.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/hypervisor.h 2009-11-23 10:44:04.000000000 +0100 @@ -70,6 +70,7 @@ extern start_info_t *xen_start_info; #endif #define init_hypervisor(c) ((void)((c)->x86_hyper_vendor = X86_HYPER_VENDOR_XEN)) +#define init_hypervisor_platform() init_hypervisor(&boot_cpu_data) struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu); @@ -351,6 +352,6 @@ MULTI_grant_table_op(multicall_entry_t * #endif -#define uvm_multi(cpumask) ((unsigned long)cpus_addr(cpumask) | UVMF_MULTI) +#define uvm_multi(cpumask) ((unsigned long)cpumask_bits(cpumask) | UVMF_MULTI) #endif /* __HYPERVISOR_H__ */ --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/irqflags.h 2009-11-06 10:52:02.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/irqflags.h 2009-11-06 10:52:22.000000000 +0100 @@ -1,7 +1,7 @@ #ifndef _X86_IRQFLAGS_H_ #define _X86_IRQFLAGS_H_ -#include <asm/processor-flags.h> +#include <asm/smp-processor-id.h> #ifndef __ASSEMBLY__ /* --- 
sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/mmu_context.h 2009-11-06 10:52:02.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/mmu_context.h 2009-11-06 10:52:22.000000000 +0100 @@ -88,12 +88,12 @@ static inline void switch_mm(struct mm_s !PagePinned(virt_to_page(next->pgd))); /* stop flush ipis for the previous mm */ - cpu_clear(cpu, prev->cpu_vm_mask); + cpumask_clear_cpu(cpu, mm_cpumask(prev)); #if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ percpu_write(cpu_tlbstate.state, TLBSTATE_OK); percpu_write(cpu_tlbstate.active_mm, next); #endif - cpu_set(cpu, next->cpu_vm_mask); + cpumask_set_cpu(cpu, mm_cpumask(next)); /* Re-load page tables: load_cr3(next->pgd) */ op->cmd = MMUEXT_NEW_BASEPTR; @@ -125,7 +125,7 @@ static inline void switch_mm(struct mm_s percpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); - if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/pci.h 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/pci.h 2009-11-06 10:52:22.000000000 +0100 @@ -151,7 +151,11 @@ static inline int __pcibus_to_node(const static inline const struct cpumask * cpumask_of_pcibus(const struct pci_bus *bus) { - return cpumask_of_node(__pcibus_to_node(bus)); + int node; + + node = __pcibus_to_node(bus); + return (node == -1) ? 
cpu_online_mask : + cpumask_of_node(node); } #endif --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/pgtable.h 2009-11-20 11:17:56.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/pgtable.h 2009-11-20 11:18:13.000000000 +0100 @@ -53,16 +53,6 @@ extern struct list_head pgd_list; #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) -static inline void __init paravirt_pagetable_setup_start(pgd_t *base) -{ - xen_pagetable_setup_start(base); -} - -static inline void __init paravirt_pagetable_setup_done(pgd_t *base) -{ - xen_pagetable_setup_done(base); -} - #define pgd_val(x) xen_pgd_val(x) #define __pgd(x) xen_make_pgd(x) @@ -134,6 +124,11 @@ static inline int pte_special(pte_t pte) #define pte_page(pte) pfn_to_page(pte_pfn(pte)) +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + static inline int pmd_large(pmd_t pte) { return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == @@ -363,7 +358,7 @@ static inline unsigned long pmd_page_vad * this macro returns the index of the entry in the pmd page which would * control the given virtual address */ -static inline unsigned pmd_index(unsigned long address) +static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } @@ -383,7 +378,7 @@ static inline unsigned pmd_index(unsigne * this function returns the index of the entry in the pte page which would * control the given virtual address */ -static inline unsigned pte_index(unsigned long address) +static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } @@ -439,11 +434,6 @@ static inline pmd_t *pmd_offset(pud_t *p return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } -static inline unsigned long pmd_pfn(pmd_t pmd) -{ - return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; -} - static inline int pud_large(pud_t 
pud) { return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == @@ -479,7 +469,7 @@ static inline unsigned long pgd_page_vad #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. */ -static inline unsigned pud_index(unsigned long address) +static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } @@ -600,7 +590,7 @@ extern int ptep_clear_flush_young(struct if (!pte_none(__res) && \ ((vma)->vm_mm != current->mm || \ HYPERVISOR_update_va_mapping(addr, __pte(0), \ - uvm_multi((vma)->vm_mm->cpu_vm_mask) | \ + uvm_multi(mm_cpumask((vma)->vm_mm)) | \ UVMF_INVLPG))) { \ __xen_pte_clear(__ptep); \ flush_tlb_page(vma, addr); \ --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/pgtable_types.h 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/pgtable_types.h 2009-11-06 10:52:22.000000000 +0100 @@ -334,6 +334,7 @@ static inline pteval_t pte_flags(pte_t p typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; +extern void set_nx(void); extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine @@ -354,14 +355,6 @@ int phys_mem_access_prot_allowed(struct /* Install a pte for a particular vaddr in kernel space. 
*/ void set_pte_vaddr(unsigned long vaddr, pte_t pte); -#ifndef CONFIG_XEN -extern void native_pagetable_setup_start(pgd_t *base); -extern void native_pagetable_setup_done(pgd_t *base); -#else -static inline void xen_pagetable_setup_start(pgd_t *base) {} -static inline void xen_pagetable_setup_done(pgd_t *base) {} -#endif - struct seq_file; extern void arch_report_meminfo(struct seq_file *m); --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/processor.h 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/processor.h 2010-03-17 14:36:55.000000000 +0100 @@ -27,6 +27,7 @@ struct mm_struct; #include <linux/cpumask.h> #include <linux/cache.h> #include <linux/threads.h> +#include <linux/math64.h> #include <linux/init.h> #include <xen/interface/physdev.h> @@ -411,7 +412,17 @@ extern unsigned long kernel_eflags; extern asmlinkage void ignore_sysret(void); #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR -DECLARE_PER_CPU(unsigned long, stack_canary); +/* + * Make sure stack canary segment base is cached-aligned: + * "For Intel Atom processors, avoid non zero segment base address + * that is not aligned to cache line boundary at all cost." + * (Optim Ref Manual Assembly/Compiler Coding Rule 15.) + */ +struct stack_canary { + char __pad[20]; /* canary at %gs:20 */ + unsigned long canary; +}; +DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif #endif /* X86_64 */ @@ -648,13 +659,23 @@ static inline void cpu_relax(void) rep_nop(); } -/* Stop speculative execution: */ +/* Stop speculative execution and prefetching of modified code. */ static inline void sync_core(void) { int tmp; - asm volatile("cpuid" : "=a" (tmp) : "0" (1) - : "ebx", "ecx", "edx", "memory"); +#if defined(CONFIG_M386) || defined(CONFIG_M486) + if (boot_cpu_data.x86 < 5) + /* There is no speculative execution. + * jmp is a barrier to prefetching. 
*/ + asm volatile("jmp 1f\n1:\n" ::: "memory"); + else +#endif + /* cpuid is a barrier to speculative execution. + * Prefetched instructions are automatically + * invalidated when modified. */ + asm volatile("cpuid" : "=a" (tmp) : "0" (1) + : "ebx", "ecx", "edx", "memory"); } static inline void __monitor(const void *eax, unsigned long ecx, @@ -945,4 +966,35 @@ extern void start_thread(struct pt_regs extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +extern int amd_get_nb_id(int cpu); + +struct aperfmperf { + u64 aperf, mperf; +}; + +static inline void get_aperfmperf(struct aperfmperf *am) +{ + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); + + rdmsrl(MSR_IA32_APERF, am->aperf); + rdmsrl(MSR_IA32_MPERF, am->mperf); +} + +#define APERFMPERF_SHIFT 10 + +static inline +unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, + struct aperfmperf *new) +{ + u64 aperf = new->aperf - old->aperf; + u64 mperf = new->mperf - old->mperf; + unsigned long ratio = aperf; + + mperf >>= APERFMPERF_SHIFT; + if (mperf) + ratio = div64_u64(aperf, mperf); + + return ratio; +} + #endif /* _ASM_X86_PROCESSOR_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/setup.h 2009-11-06 10:52:22.000000000 +0100 @@ -0,0 +1,8 @@ +#ifndef __ASSEMBLY__ + +void xen_start_kernel(void); +void xen_arch_setup(void); + +#endif + +#include_next <asm/setup.h> --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/smp-processor-id.h 2009-11-06 10:52:22.000000000 +0100 @@ -0,0 +1,36 @@ +#ifndef _ASM_X86_SMP_PROCESSOR_ID_H +#define _ASM_X86_SMP_PROCESSOR_ID_H + +#if defined(CONFIG_SMP) && !defined(__ASSEMBLY__) + +#include <asm/percpu.h> + +DECLARE_PER_CPU(int, cpu_number); + +/* + * This function is needed by all SMP systems. It must _always_ be valid + * from the initial startup. 
We map APIC_BASE very early in page_setup(), + * so this is correct in the x86 case. + */ +#define raw_smp_processor_id() percpu_read(cpu_number) +#define safe_smp_processor_id() smp_processor_id() + +#ifdef CONFIG_X86_64_SMP +#define stack_smp_processor_id() \ +({ \ + struct thread_info *ti; \ + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ + ti->cpu; \ +}) +#endif + +#ifdef CONFIG_DEBUG_PREEMPT +extern unsigned int debug_smp_processor_id(void); +# define smp_processor_id() debug_smp_processor_id() +#else +# define smp_processor_id() raw_smp_processor_id() +#endif + +#endif /* SMP && !__ASSEMBLY__ */ + +#endif /* _ASM_X86_SMP_PROCESSOR_ID_H */ --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/smp.h 2009-11-20 11:17:59.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/smp.h 2009-11-20 11:18:10.000000000 +0100 @@ -121,7 +121,6 @@ static inline void arch_send_call_functi smp_ops.send_call_func_single_ipi(cpu); } -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) { smp_ops.send_call_func_ipi(mask); @@ -167,27 +166,7 @@ static inline int num_booting_cpus(void) extern unsigned disabled_cpus __cpuinitdata; -#ifdef CONFIG_X86_32_SMP -/* - * This function is needed by all SMP systems. It must _always_ be valid - * from the initial startup. We map APIC_BASE very early in page_setup(), - * so this is correct in the x86 case. 
- */ -#define raw_smp_processor_id() (percpu_read(cpu_number)) -#define safe_smp_processor_id() smp_processor_id() - -#elif defined(CONFIG_X86_64_SMP) -#define raw_smp_processor_id() (percpu_read(cpu_number)) - -#define stack_smp_processor_id() \ -({ \ - struct thread_info *ti; \ - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ - ti->cpu; \ -}) -#define safe_smp_processor_id() smp_processor_id() - -#endif +#include <asm/smp-processor-id.h> #ifdef CONFIG_X86_LOCAL_APIC --- sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/system.h 2009-11-06 10:52:02.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/system.h 2009-11-06 10:52:22.000000000 +0100 @@ -30,7 +30,7 @@ void __switch_to_xtra(struct task_struct "movl %P[task_canary](%[next]), %%ebx\n\t" \ "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" #define __switch_canary_oparam \ - , [stack_canary] "=m" (per_cpu_var(stack_canary)) + , [stack_canary] "=m" (per_cpu_var(stack_canary.canary)) #define __switch_canary_iparam \ , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) #else /* CC_STACKPROTECTOR */ @@ -149,33 +149,6 @@ do { \ #endif #ifdef __KERNEL__ -#define _set_base(addr, base) do { unsigned long __pr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %%dl,%2\n\t" \ - "movb %%dh,%3" \ - :"=&d" (__pr) \ - :"m" (*((addr)+2)), \ - "m" (*((addr)+4)), \ - "m" (*((addr)+7)), \ - "0" (base) \ - ); } while (0) - -#define _set_limit(addr, limit) do { unsigned long __lr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %2,%%dh\n\t" \ - "andb $0xf0,%%dh\n\t" \ - "orb %%dh,%%dl\n\t" \ - "movb %%dl,%2" \ - :"=&d" (__lr) \ - :"m" (*(addr)), \ - "m" (*((addr)+6)), \ - "0" (limit) \ - ); } while (0) - -#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base)) -#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1)) extern void xen_load_gs_index(unsigned); --- 
sle11sp1-2010-03-29.orig/arch/x86/include/mach-xen/asm/tlbflush.h 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/include/mach-xen/asm/tlbflush.h 2009-11-06 10:52:22.000000000 +0100 @@ -74,9 +74,9 @@ static inline void reset_lazy_tlbstate(v #define local_flush_tlb() __flush_tlb() #define flush_tlb_all xen_tlb_flush_all -#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask) -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask) -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va) +#define flush_tlb_current_task() xen_tlb_flush_mask(mm_cpumask(current->mm)) +#define flush_tlb_mm(mm) xen_tlb_flush_mask(mm_cpumask(mm)) +#define flush_tlb_page(vma, va) xen_invlpg_mask(mm_cpumask((vma)->vm_mm), va) #define flush_tlb() flush_tlb_current_task() --- sle11sp1-2010-03-29.orig/arch/x86/kernel/Makefile 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/Makefile 2009-11-06 10:52:22.000000000 +0100 @@ -132,8 +132,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o - - time_64-$(CONFIG_XEN) += time_32.o endif disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \ --- sle11sp1-2010-03-29.orig/arch/x86/kernel/apic/io_apic-xen.c 2010-02-18 15:29:00.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/apic/io_apic-xen.c 2009-11-06 10:52:22.000000000 +0100 @@ -79,6 +79,8 @@ unsigned long io_apic_irqs; #endif /* CONFIG_XEN */ #define __apicdebuginit(type) static type __init +#define for_each_irq_pin(entry, head) \ + for (entry = head; entry; entry = entry->next) /* * Is the SiS APIC rmw bug present ? 
@@ -100,12 +102,24 @@ int nr_ioapic_registers[MAX_IO_APICS]; struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; +/* IO APIC gsi routing info */ +struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; + /* MP IRQ source entries */ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +#ifndef CONFIG_XEN +/* Number of legacy interrupts */ +static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; +/* GSI interrupts */ +static int nr_irqs_gsi = NR_IRQS_LEGACY; +#else +#define nr_legacy_irqs NR_IRQS_LEGACY +#endif + #if defined (CONFIG_MCA) || defined (CONFIG_EISA) int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -132,15 +146,6 @@ static int __init parse_noapic(char *str early_param("noapic", parse_noapic); #ifndef CONFIG_XEN -struct irq_pin_list; - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - struct irq_pin_list { int apic, pin; struct irq_pin_list *next; @@ -155,6 +160,11 @@ static struct irq_pin_list *get_one_free return pin; } +/* + * This is performance-critical, we want to do it O(1) + * + * Most irqs are mapped 1:1 with pins. 
+ */ struct irq_cfg { struct irq_pin_list *irq_2_pin; cpumask_var_t domain; @@ -188,6 +198,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] [15] = { .vector = IRQ15_VECTOR, }, }; +void __init io_apic_disable_legacy(void) +{ + nr_legacy_irqs = 0; + nr_irqs_gsi = 0; +} + int __init arch_early_irq_init(void) { struct irq_cfg *cfg; @@ -205,7 +221,7 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < NR_IRQS_LEGACY) + if (i < nr_legacy_irqs) cpumask_setall(cfg[i].domain); } @@ -231,17 +247,14 @@ static struct irq_cfg *get_one_free_irq_ cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { - if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { kfree(cfg); cfg = NULL; - } else if (!alloc_cpumask_var_node(&cfg->old_domain, + } else if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_ATOMIC, node)) { free_cpumask_var(cfg->domain); kfree(cfg); cfg = NULL; - } else { - cpumask_clear(cfg->domain); - cpumask_clear(cfg->old_domain); } } @@ -455,13 +468,10 @@ static bool io_apic_level_ack_pending(st unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - entry = cfg->irq_2_pin; - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; - if (!entry) - break; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ @@ -469,9 +479,6 @@ static bool io_apic_level_ack_pending(st spin_unlock_irqrestore(&ioapic_lock, flags); return true; } - if (!entry->next) - break; - entry = entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -543,72 +550,68 @@ static void ioapic_mask_entry(int apic, * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. 
*/ -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int +add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) { - struct irq_pin_list *entry; + struct irq_pin_list **last, *entry; - entry = cfg->irq_2_pin; - if (!entry) { - entry = get_one_free_irq_2_pin(node); - if (!entry) { - printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", - apic, pin); - return; - } - cfg->irq_2_pin = entry; - entry->apic = apic; - entry->pin = pin; - return; - } - - while (entry->next) { - /* not again, please */ + /* don't allow duplicates */ + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) - return; - - entry = entry->next; + return 0; + last = &entry->next; } - entry->next = get_one_free_irq_2_pin(node); - entry = entry->next; + entry = get_one_free_irq_2_pin(node); + if (!entry) { + printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; + } entry->apic = apic; entry->pin = pin; + + *last = entry; + return 0; +} + +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* * Reroute an IRQ to a different pin. */ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, - int oldapic, int oldpin, - int newapic, int newpin) + int oldapic, int oldpin, + int newapic, int newpin) { - struct irq_pin_list *entry = cfg->irq_2_pin; - int replaced = 0; + struct irq_pin_list *entry; - while (entry) { + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - replaced = 1; /* every one is different, right? */ - break; + return; } - entry = entry->next; } - /* why? call replace before add? 
*/ - if (!replaced) - add_pin_to_irq_node(cfg, node, newapic, newpin); + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(cfg, node, newapic, newpin); } -static inline void io_apic_modify_irq(struct irq_cfg *cfg, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) +static void io_apic_modify_irq(struct irq_cfg *cfg, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) { int pin; struct irq_pin_list *entry; - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin * 2); @@ -625,7 +628,6 @@ static void __unmask_IO_APIC_irq(struct io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } -#ifdef CONFIG_X86_64 static void io_apic_sync(struct irq_pin_list *entry) { /* @@ -641,11 +643,6 @@ static void __mask_IO_APIC_irq(struct ir { io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -#else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); -} static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { @@ -658,7 +655,6 @@ static void __unmask_and_level_IO_APIC_i io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } -#endif /* CONFIG_X86_32 */ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { @@ -719,6 +715,7 @@ static void clear_IO_APIC (void) } #else #define add_pin_to_irq_node(cfg, node, apic, pin) +#define add_pin_to_irq_node_nopanic(cfg, node, apic, pin) 0 #endif /* CONFIG_XEN */ #ifdef CONFIG_X86_32 @@ -935,7 +932,7 @@ static int __init find_isa_irq_apic(int */ static int EISA_ELCR(unsigned int irq) { - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1547,7 +1544,7 @@ static void setup_IO_APIC_irq(int apic_i } 
ioapic_register_intr(irq, desc, trigger); - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) disable_8259A_irq(irq); ioapic_write_entry(apic_id, pin, entry); @@ -1775,12 +1772,8 @@ __apicdebuginit(void) print_IO_APIC(void if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = entry->next; - } printk("\n"); } @@ -1924,7 +1917,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET) + if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1956,7 +1949,7 @@ __apicdebuginit(int) print_all_ICs(void) print_PIC(); /* don't print out if apic is not there */ - if (!cpu_has_apic || disable_apic) + if (!cpu_has_apic && !apic_from_smp_config()) return 0; print_all_local_APICs(); @@ -1990,6 +1983,10 @@ void __init enable_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } + + if (!nr_legacy_irqs) + return; + #ifndef CONFIG_XEN for(apic = 0; apic < nr_ioapics; apic++) { int pin; @@ -2049,6 +2046,9 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); + if (!nr_legacy_irqs) + return; + /* * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode @@ -2082,7 +2082,7 @@ void disable_IO_APIC(void) /* * Use virtual wire A mode when interrupt remapping is enabled. 
*/ - if (cpu_has_apic) + if (cpu_has_apic || apic_from_smp_config()) disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); } @@ -2095,7 +2095,7 @@ void disable_IO_APIC(void) * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ -static void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ -2104,9 +2104,8 @@ static void __init setup_ioapic_ids_from unsigned char old_id; unsigned long flags; - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + if (acpi_ioapic) return; - /* * Don't check I/O APIC IDs for xAPIC systems. They have * no meaning without the serial APIC bus. @@ -2280,7 +2279,7 @@ static unsigned int startup_ioapic_irq(u struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; @@ -2292,7 +2291,6 @@ static unsigned int startup_ioapic_irq(u return was_pending; } -#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { @@ -2305,14 +2303,6 @@ static int ioapic_retrigger_irq(unsigned return 1; } -#else -static int ioapic_retrigger_irq(unsigned int irq) -{ - apic->send_IPI_self(irq_cfg(irq)->vector); - - return 1; -} -#endif /* * Level and edge triggered IO-APIC interrupts need different handling, @@ -2350,13 +2340,9 @@ static void __target_IO_APIC_irq(unsigne struct irq_pin_list *entry; u8 vector = cfg->vector; - entry = cfg->irq_2_pin; - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; - if (!entry) - break; - apic = entry->apic; pin = entry->pin; /* @@ -2369,9 +2355,6 @@ static void __target_IO_APIC_irq(unsigne reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; } } @@ -2596,11 +2579,8 @@ atomic_t irq_mis_count; static void 
ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - -#ifdef CONFIG_X86_32 unsigned long v; int i; -#endif struct irq_cfg *cfg; int do_unmask_irq = 0; @@ -2613,31 +2593,28 @@ static void ack_apic_level(unsigned int } #endif -#ifdef CONFIG_X86_32 /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. 
The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ cfg = desc->chip_data; i = cfg->vector; - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); -#endif /* * We must acknowledge the irq before we move it or the acknowledge will @@ -2679,7 +2656,7 @@ static void ack_apic_level(unsigned int unmask_IO_APIC_irq_desc(desc); } -#ifdef CONFIG_X86_32 + /* Tail end of version 0x11 I/O APIC bug workaround */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); @@ -2687,26 +2664,15 @@ static void ack_apic_level(unsigned int __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } -#endif } #ifdef CONFIG_INTR_REMAP static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { - int apic, pin; struct irq_pin_list *entry; - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } + for_each_irq_pin(entry, cfg->irq_2_pin) + io_apic_eoi(entry->apic, entry->pin); } static void @@ -2796,7 +2762,7 @@ static inline void init_IO_APIC_traps(vo * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) make_8259A_irq(irq); else /* Strange. Oh, well.. */ @@ -3136,7 +3102,7 @@ out: * the I/O APIC in all cases now. No actual device should request * it anyway. 
--macro */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) +#define PIC_IRQS (1UL << PIC_CASCADE_IR) void __init setup_IO_APIC(void) { @@ -3148,23 +3114,21 @@ void __init setup_IO_APIC(void) * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ #endif - - io_apic_irqs = ~PIC_IRQS; + io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* * Set up IO-APIC IRQ routing. */ #ifndef CONFIG_XEN -#ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#endif + x86_init.mpparse.setup_ioapic_ids(); + sync_Arb_IDs(); #endif setup_IO_APIC_irqs(); init_IO_APIC_traps(); - check_timer(); + if (nr_legacy_irqs) + check_timer(); } /* @@ -3274,7 +3238,6 @@ static int __init ioapic_init_sysfs(void device_initcall(ioapic_init_sysfs); -static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ @@ -3346,8 +3309,7 @@ void destroy_irq(unsigned int irq) cfg = desc->chip_data; dynamic_irq_cleanup(irq); /* connect back irq_cfg */ - if (desc) - desc->chip_data = cfg; + desc->chip_data = cfg; free_irte(irq); spin_lock_irqsave(&vector_lock, flags); @@ -4025,9 +3987,13 @@ static int __io_apic_set_pci_routing(str /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= NR_IRQS_LEGACY) { + if (irq >= nr_legacy_irqs) { cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, ioapic, pin); + if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { + printk(KERN_INFO "can not add pin %d for irq %d\n", + pin, irq); + return 0; + } } setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); @@ -4056,11 +4022,28 @@ int io_apic_set_pci_routing(struct devic return __io_apic_set_pci_routing(dev, irq, irq_attr); } -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ +u8 __init io_apic_unique_id(u8 id) +{ +#ifdef CONFIG_X86_32 + if ((boot_cpu_data.x86_vendor 
== X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else + return id; +#else + int i; + DECLARE_BITMAP(used, 256); -#ifdef CONFIG_ACPI + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +#endif +} #ifdef CONFIG_X86_32 int __init io_apic_get_unique_id(int ioapic, int apic_id) @@ -4171,8 +4154,6 @@ int acpi_get_override_irq(int bus_irq, i return 0; } -#endif /* CONFIG_ACPI */ - #ifndef CONFIG_XEN /* * This function currently is only a helper for the i386 smp boot process where @@ -4227,7 +4208,7 @@ void __init setup_ioapic_dest(void) static struct resource *ioapic_resources; -static struct resource * __init ioapic_setup_resources(void) +static struct resource * __init ioapic_setup_resources(int nr_ioapics) { unsigned long n; struct resource *res; @@ -4243,15 +4224,13 @@ static struct resource * __init ioapic_s mem = alloc_bootmem(n); res = (void *)mem; - if (mem != NULL) { - mem += sizeof(struct resource) * nr_ioapics; + mem += sizeof(struct resource) * nr_ioapics; - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; } ioapic_resources = res; @@ -4265,7 +4244,7 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; - ioapic_res = ioapic_setup_resources(); + ioapic_res = ioapic_setup_resources(nr_ioapics); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].apicaddr; @@ -4294,11 +4273,9 @@ fake_ioapic_page: __fix_to_virt(idx), ioapic_phys); idx++; - if (ioapic_res != 
NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; } } @@ -4320,3 +4297,78 @@ void __init ioapic_insert_resources(void } } #endif /* !CONFIG_XEN */ + +int mp_find_ioapic(int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_gsi_routing[i].gsi_base) + && (gsi <= mp_gsi_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +int mp_find_ioapic_pin(int ioapic, int gsi) +{ + if (WARN_ON(ioapic == -1)) + return -1; + if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) + return -1; + + return gsi - mp_gsi_routing[ioapic].gsi_base; +} + +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " + "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); + return 1; + } + if (!address) { + printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +{ + int idx = 0; + + if (bad_ioapic(address)) + return; + + idx = nr_ioapics; + + mp_ioapics[idx].type = MP_IOAPIC; + mp_ioapics[idx].flags = MPC_APIC_USABLE; + mp_ioapics[idx].apicaddr = address; + +#ifndef CONFIG_XEN + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); +#endif + mp_ioapics[idx].apicid = io_apic_unique_id(id); + mp_ioapics[idx].apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 
+ */ + mp_gsi_routing[idx].gsi_base = gsi_base; + mp_gsi_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, + mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, + mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); + + nr_ioapics++; +} --- sle11sp1-2010-03-29.orig/arch/x86/kernel/cpu/Makefile 2010-02-09 16:56:39.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/cpu/Makefile 2010-02-09 17:07:42.000000000 +0100 @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o -disabled-obj-$(CONFIG_XEN) := hypervisor.o vmware.o +disabled-obj-$(CONFIG_XEN) := hypervisor.o vmware.o sched.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ --- sle11sp1-2010-03-29.orig/arch/x86/kernel/cpu/amd.c 2010-01-18 16:53:52.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/cpu/amd.c 2010-01-18 16:55:14.000000000 +0100 @@ -336,7 +336,7 @@ static void __cpuinit amd_detect_cmp(str int amd_get_nb_id(int cpu) { int id = 0; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) id = per_cpu(cpu_llc_id, cpu); #endif return id; @@ -492,8 +492,10 @@ static void __cpuinit init_amd(struct cp if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#ifndef CONFIG_XEN /* get apicid instead of initial apic id from cpuid */ c->apicid = hard_smp_processor_id(); +#endif #else /* --- sle11sp1-2010-03-29.orig/arch/x86/kernel/cpu/common-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/cpu/common-xen.c 2009-11-06 10:52:22.000000000 +0100 @@ -13,13 +13,13 @@ #include <linux/io.h> #include <asm/stackprotector.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/hypervisor.h> #include <asm/processor.h> #include <asm/sections.h> -#include 
<asm/topology.h> -#include <asm/cpumask.h> +#include <linux/topology.h> +#include <linux/cpumask.h> #include <asm/pgtable.h> #include <asm/atomic.h> #include <asm/proto.h> @@ -28,13 +28,12 @@ #include <asm/desc.h> #include <asm/i387.h> #include <asm/mtrr.h> -#include <asm/numa.h> +#include <linux/numa.h> #include <asm/asm.h> #include <asm/cpu.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/pat.h> -#include <asm/smp.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> @@ -102,17 +101,17 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_p * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) */ - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), #else - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), 
#ifndef CONFIG_XEN /* * Segments used for calling PnP BIOS have byte granularity. @@ -120,29 +119,29 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_p * the transfer segment sizes are set at run time. */ /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. 
*/ /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), - [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), #endif - [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), GDT_STACK_CANARY_INIT #endif } }; @@ -900,7 +899,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_counters(); + init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -1013,7 +1012,7 @@ __setup("clearcpuid=", setup_disablecpui #ifdef CONFIG_X86_64 #ifndef CONFIG_X86_NO_IDT -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; #endif DEFINE_PER_CPU_FIRST(union irq_stack_union, @@ -1027,13 +1026,21 @@ void xen_switch_pt(void) #endif } -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; +/* + * The following four percpu variables are hot. Align current_task to + * cacheline size such that all four fall in the same cacheline. 
+ */ +DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = + &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(unsigned long, kernel_stack) = (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; EXPORT_PER_CPU_SYMBOL(kernel_stack); +DEFINE_PER_CPU(char *, irq_stack_ptr) = + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + DEFINE_PER_CPU(unsigned int, irq_count) = -1; #ifndef CONFIG_X86_NO_TSS @@ -1049,8 +1056,7 @@ static const unsigned int exception_stac }; static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) - __aligned(PAGE_SIZE); + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); #endif void __cpuinit syscall_init(void) @@ -1097,8 +1103,11 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist #else /* CONFIG_X86_64 */ +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + #ifdef CONFIG_CC_STACKPROTECTOR -DEFINE_PER_CPU(unsigned long, stack_canary); +DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif /* Make sure %fs and %gs are initialized properly in idle threads */ --- sle11sp1-2010-03-29.orig/arch/x86/kernel/cpu/mcheck/mce-inject.c 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/arch/x86/kernel/cpu/mcheck/mce-inject.c 2009-11-06 10:52:22.000000000 +0100 @@ -143,7 +143,7 @@ static void raise_mce(struct mce *m) if (context == MCJ_CTX_RANDOM) return; -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) if (m->inject_flags & MCJ_NMI_BROADCAST) { unsigned long start; int cpu; --- sle11sp1-2010-03-29.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-11-06 10:51:55.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-11-06 10:52:22.000000000 +0100 @@ -1,10 +1,9 @@ -#include <linux/init.h> -#include <linux/proc_fs.h> -#include <linux/ctype.h> +#define DEBUG + +#include <linux/uaccess.h> 
#include <linux/module.h> -#include <linux/seq_file.h> -#include <asm/uaccess.h> #include <linux/mutex.h> +#include <linux/init.h> #include <asm/mtrr.h> #include "mtrr.h" @@ -58,7 +57,7 @@ static void __init init_table(void) mtrr_usage_table[i] = 0; } -int mtrr_add_page(unsigned long base, unsigned long size, +int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, bool increment) { int error; @@ -88,25 +87,23 @@ int mtrr_add_page(unsigned long base, un static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_WARNING - "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG - "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; } return 0; } -int -mtrr_add(unsigned long base, unsigned long size, unsigned int type, - bool increment) +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } +EXPORT_SYMBOL(mtrr_add); int mtrr_del_page(int reg, unsigned long base, unsigned long size) { @@ -128,13 +125,13 @@ int mtrr_del_page(int reg, unsigned long } } if (reg < 0) { - printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, - size); + pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", + base, size); goto out; } } if (mtrr_usage_table[reg] < 1) { - printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); + pr_warning("mtrr: reg: %d has count=0\n", reg); goto out; } if (--mtrr_usage_table[reg] < 1) { @@ -153,15 +150,12 @@ int mtrr_del_page(int reg, unsigned long return error; } -int -mtrr_del(int reg, unsigned long base, unsigned long size) +int mtrr_del(int reg, unsigned long base, unsigned long size) { if (mtrr_check(base, size)) return 
-EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } - -EXPORT_SYMBOL(mtrr_add); EXPORT_SYMBOL(mtrr_del); /* --- sle11sp1-2010-03-29.orig/arch/x86/kernel/e820-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/e820-xen.c 2009-12-04 11:31:40.000000000 +0100 @@ -134,7 +134,7 @@ static void __init __e820_add_region(str { int x = e820x->nr_map; - if (x == ARRAY_SIZE(e820x->map)) { + if (x >= ARRAY_SIZE(e820x->map)) { printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); return; } @@ -1455,7 +1455,7 @@ void __init e820_reserve_resources(void) struct resource *res; u64 end; - res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); e820_res = res; for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; @@ -1502,8 +1502,8 @@ static unsigned long ram_alignment(resou if (mb < 16) return 1024*1024; - /* To 32MB for anything above that */ - return 32*1024*1024; + /* To 64MB for anything above that */ + return 64*1024*1024; } #define MAX_RESOURCE_SIZE ((resource_size_t)-1) @@ -1543,59 +1543,8 @@ void __init e820_reserve_resources_late( #undef e820 -#ifndef CONFIG_XEN char *__init default_machine_specific_memory_setup(void) { - char *who = "BIOS-e820"; - u32 new_nr; - /* - * Try to copy the BIOS-supplied E820-map. 
- * - * Otherwise fake a memory map; one section from 0k->640k, - * the next section from 1mb->appropriate_mem_k - */ - new_nr = boot_params.e820_entries; - sanitize_e820_map(boot_params.e820_map, - ARRAY_SIZE(boot_params.e820_map), - &new_nr); - boot_params.e820_entries = new_nr; - if (append_e820_map(boot_params.e820_map, boot_params.e820_entries) - < 0) { - u64 mem_size; - - /* compare results from other methods and take the greater */ - if (boot_params.alt_mem_k - < boot_params.screen_info.ext_mem_k) { - mem_size = boot_params.screen_info.ext_mem_k; - who = "BIOS-88"; - } else { - mem_size = boot_params.alt_mem_k; - who = "BIOS-e801"; - } - - e820.nr_map = 0; - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } - - /* In case someone cares... */ - return who; -} - -char *__init __attribute__((weak)) machine_specific_memory_setup(void) -{ - if (x86_quirks->arch_memory_setup) { - char *who = x86_quirks->arch_memory_setup(); - - if (who) - return who; - } - return default_machine_specific_memory_setup(); -} -#endif - -static char * __init _memory_setup(void) -{ int rc, nr_map; struct xen_memory_map memmap; static struct e820entry __initdata map[E820MAX]; @@ -1639,7 +1588,7 @@ void __init setup_memory_map(void) { char *who; - who = _memory_setup(); + who = x86_init.resources.memory_setup(); #ifdef CONFIG_XEN if (is_initial_xendomain()) { printk(KERN_INFO "Xen-provided machine memory map:\n"); --- sle11sp1-2010-03-29.orig/arch/x86/kernel/early_printk-xen.c 2009-11-06 10:52:02.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/early_printk-xen.c 2009-11-06 10:52:22.000000000 +0100 @@ -178,7 +178,6 @@ static __init void early_serial_init(cha * mappings. Someone should fix this for domain 0. For now, use fake serial. 
*/ #define early_vga_console early_serial_console -#define xenboot_console early_serial_console #endif @@ -189,721 +188,6 @@ static struct console early_serial_conso .index = -1, }; -#ifdef CONFIG_EARLY_PRINTK_DBGP - -static struct ehci_caps __iomem *ehci_caps; -static struct ehci_regs __iomem *ehci_regs; -static struct ehci_dbg_port __iomem *ehci_debug; -static unsigned int dbgp_endpoint_out; - -struct ehci_dev { - u32 bus; - u32 slot; - u32 func; -}; - -static struct ehci_dev ehci_dev; - -#define USB_DEBUG_DEVNUM 127 - -#define DBGP_DATA_TOGGLE 0x8800 - -static inline u32 dbgp_pid_update(u32 x, u32 tok) -{ - return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); -} - -static inline u32 dbgp_len_update(u32 x, u32 len) -{ - return (x & ~0x0f) | (len & 0x0f); -} - -/* - * USB Packet IDs (PIDs) - */ - -/* token */ -#define USB_PID_OUT 0xe1 -#define USB_PID_IN 0x69 -#define USB_PID_SOF 0xa5 -#define USB_PID_SETUP 0x2d -/* handshake */ -#define USB_PID_ACK 0xd2 -#define USB_PID_NAK 0x5a -#define USB_PID_STALL 0x1e -#define USB_PID_NYET 0x96 -/* data */ -#define USB_PID_DATA0 0xc3 -#define USB_PID_DATA1 0x4b -#define USB_PID_DATA2 0x87 -#define USB_PID_MDATA 0x0f -/* Special */ -#define USB_PID_PREAMBLE 0x3c -#define USB_PID_ERR 0x3c -#define USB_PID_SPLIT 0x78 -#define USB_PID_PING 0xb4 -#define USB_PID_UNDEF_0 0xf0 - -#define USB_PID_DATA_TOGGLE 0x88 -#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) - -#define PCI_CAP_ID_EHCI_DEBUG 0xa - -#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ -#define HUB_SHORT_RESET_TIME 10 -#define HUB_LONG_RESET_TIME 200 -#define HUB_RESET_TIMEOUT 500 - -#define DBGP_MAX_PACKET 8 - -static int dbgp_wait_until_complete(void) -{ - u32 ctrl; - int loop = 0x100000; - - do { - ctrl = readl(&ehci_debug->control); - /* Stop when the transaction is finished */ - if (ctrl & DBGP_DONE) - break; - } while (--loop > 0); - - if (!loop) - return -1; - - /* - * Now that we have observed the completed transaction, - * clear the done 
bit. - */ - writel(ctrl | DBGP_DONE, &ehci_debug->control); - return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); -} - -static void __init dbgp_mdelay(int ms) -{ - int i; - - while (ms--) { - for (i = 0; i < 1000; i++) - outb(0x1, 0x80); - } -} - -static void dbgp_breath(void) -{ - /* Sleep to give the debug port a chance to breathe */ -} - -static int dbgp_wait_until_done(unsigned ctrl) -{ - u32 pids, lpid; - int ret; - int loop = 3; - -retry: - writel(ctrl | DBGP_GO, &ehci_debug->control); - ret = dbgp_wait_until_complete(); - pids = readl(&ehci_debug->pids); - lpid = DBGP_PID_GET(pids); - - if (ret < 0) - return ret; - - /* - * If the port is getting full or it has dropped data - * start pacing ourselves, not necessary but it's friendly. - */ - if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) - dbgp_breath(); - - /* If I get a NACK reissue the transmission */ - if (lpid == USB_PID_NAK) { - if (--loop > 0) - goto retry; - } - - return ret; -} - -static void dbgp_set_data(const void *buf, int size) -{ - const unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = hi = 0; - for (i = 0; i < 4 && i < size; i++) - lo |= bytes[i] << (8*i); - for (; i < 8 && i < size; i++) - hi |= bytes[i] << (8*(i - 4)); - writel(lo, &ehci_debug->data03); - writel(hi, &ehci_debug->data47); -} - -static void __init dbgp_get_data(void *buf, int size) -{ - unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = readl(&ehci_debug->data03); - hi = readl(&ehci_debug->data47); - for (i = 0; i < 4 && i < size; i++) - bytes[i] = (lo >> (8*i)) & 0xff; - for (; i < 8 && i < size; i++) - bytes[i] = (hi >> (8*(i - 4))) & 0xff; -} - -static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, - const char *bytes, int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_OUT); - - ctrl = readl(&ehci_debug->control); - 
ctrl = dbgp_len_update(ctrl, size); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - dbgp_set_data(bytes, size); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - return ret; -} - -static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, - int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_IN); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl &= ~DBGP_OUT; - ctrl |= DBGP_GO; - - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - if (size > ret) - size = ret; - dbgp_get_data(data, size); - return ret; -} - -static int __init dbgp_control_msg(unsigned devnum, int requesttype, - int request, int value, int index, void *data, int size) -{ - u32 pids, addr, ctrl; - struct usb_ctrlrequest req; - int read; - int ret; - - read = (requesttype & USB_DIR_IN) != 0; - if (size > (read ? 
DBGP_MAX_PACKET:0)) - return -1; - - /* Compute the control message */ - req.bRequestType = requesttype; - req.bRequest = request; - req.wValue = cpu_to_le16(value); - req.wIndex = cpu_to_le16(index); - req.wLength = cpu_to_le16(size); - - pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); - addr = DBGP_EPADDR(devnum, 0); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, sizeof(req)); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - /* Send the setup message */ - dbgp_set_data(&req, sizeof(req)); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - /* Read the result */ - return dbgp_bulk_read(devnum, 0, data, size); -} - - -/* Find a PCI capability */ -static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) -{ - u8 pos; - int bytes; - - if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & - PCI_STATUS_CAP_LIST)) - return 0; - - pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { - u8 id; - - pos &= ~3; - id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); - if (id == 0xff) - break; - if (id == cap) - return pos; - - pos = read_pci_config_byte(num, slot, func, - pos+PCI_CAP_LIST_NEXT); - } - return 0; -} - -static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) -{ - u32 class; - - class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); - if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) - return 0; - - return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); -} - -static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) -{ - u32 bus, slot, func; - - for (bus = 0; bus < 256; bus++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - unsigned cap; - - cap = __find_dbgp(bus, slot, func); - - if (!cap) - continue; - if (ehci_num-- != 0) - continue; - *rbus = bus; - *rslot = slot; - *rfunc = func; - return cap; - } - } - 
} - return 0; -} - -static int __init ehci_reset_port(int port) -{ - u32 portsc; - u32 delay_time, delay; - int loop; - - /* Reset the usb debug port */ - portsc = readl(&ehci_regs->port_status[port - 1]); - portsc &= ~PORT_PE; - portsc |= PORT_RESET; - writel(portsc, &ehci_regs->port_status[port - 1]); - - delay = HUB_ROOT_RESET_TIME; - for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; - delay_time += delay) { - dbgp_mdelay(delay); - - portsc = readl(&ehci_regs->port_status[port - 1]); - if (portsc & PORT_RESET) { - /* force reset to complete */ - loop = 2; - writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), - &ehci_regs->port_status[port - 1]); - do { - portsc = readl(&ehci_regs->port_status[port-1]); - } while ((portsc & PORT_RESET) && (--loop > 0)); - } - - /* Device went away? */ - if (!(portsc & PORT_CONNECT)) - return -ENOTCONN; - - /* bomb out completely if something weird happend */ - if ((portsc & PORT_CSC)) - return -EINVAL; - - /* If we've finished resetting, then break out of the loop */ - if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) - return 0; - } - return -EBUSY; -} - -static int __init ehci_wait_for_port(int port) -{ - u32 status; - int ret, reps; - - for (reps = 0; reps < 3; reps++) { - dbgp_mdelay(100); - status = readl(&ehci_regs->status); - if (status & STS_PCD) { - ret = ehci_reset_port(port); - if (ret == 0) - return 0; - } - } - return -ENOTCONN; -} - -#ifdef DBGP_DEBUG -# define dbgp_printk early_printk -#else -static inline void dbgp_printk(const char *fmt, ...) 
{ } -#endif - -typedef void (*set_debug_port_t)(int port); - -static void __init default_set_debug_port(int port) -{ -} - -static set_debug_port_t __initdata set_debug_port = default_set_debug_port; - -static void __init nvidia_set_debug_port(int port) -{ - u32 dword; - dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x74); - dword &= ~(0x0f<<12); - dword |= ((port & 0x0f)<<12); - write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, - dword); - dbgp_printk("set debug port to %d\n", port); -} - -static void __init detect_set_debug_port(void) -{ - u32 vendorid; - - vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x00); - - if ((vendorid & 0xffff) == 0x10de) { - dbgp_printk("using nvidia set_debug_port\n"); - set_debug_port = nvidia_set_debug_port; - } -} - -static int __init ehci_setup(void) -{ - struct usb_debug_descriptor dbgp_desc; - u32 cmd, ctrl, status, portsc, hcs_params; - u32 debug_port, new_debug_port = 0, n_ports; - u32 devnum; - int ret, i; - int loop; - int port_map_tried; - int playtimes = 3; - -try_next_time: - port_map_tried = 0; - -try_next_port: - - hcs_params = readl(&ehci_caps->hcs_params); - debug_port = HCS_DEBUG_PORT(hcs_params); - n_ports = HCS_N_PORTS(hcs_params); - - dbgp_printk("debug_port: %d\n", debug_port); - dbgp_printk("n_ports: %d\n", n_ports); - - for (i = 1; i <= n_ports; i++) { - portsc = readl(&ehci_regs->port_status[i-1]); - dbgp_printk("portstatus%d: %08x\n", i, portsc); - } - - if (port_map_tried && (new_debug_port != debug_port)) { - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - return -1; - } - - loop = 10; - /* Reset the EHCI controller */ - cmd = readl(&ehci_regs->command); - cmd |= CMD_RESET; - writel(cmd, &ehci_regs->command); - do { - cmd = readl(&ehci_regs->command); - } while ((cmd & CMD_RESET) && (--loop > 0)); - - if (!loop) { - dbgp_printk("can not reset ehci\n"); - return -1; - } - dbgp_printk("ehci reset done\n"); - - 
/* Claim ownership, but do not enable yet */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_OWNER; - ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); - writel(ctrl, &ehci_debug->control); - - /* Start the ehci running */ - cmd = readl(&ehci_regs->command); - cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); - cmd |= CMD_RUN; - writel(cmd, &ehci_regs->command); - - /* Ensure everything is routed to the EHCI */ - writel(FLAG_CF, &ehci_regs->configured_flag); - - /* Wait until the controller is no longer halted */ - loop = 10; - do { - status = readl(&ehci_regs->status); - } while ((status & STS_HALT) && (--loop > 0)); - - if (!loop) { - dbgp_printk("ehci can be started\n"); - return -1; - } - dbgp_printk("ehci started\n"); - - /* Wait for a device to show up in the debug port */ - ret = ehci_wait_for_port(debug_port); - if (ret < 0) { - dbgp_printk("No device found in debug port\n"); - goto next_debug_port; - } - dbgp_printk("ehci wait for port done\n"); - - /* Enable the debug port */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_CLAIM; - writel(ctrl, &ehci_debug->control); - ctrl = readl(&ehci_debug->control); - if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { - dbgp_printk("No device in debug port\n"); - writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); - goto err; - } - dbgp_printk("debug ported enabled\n"); - - /* Completely transfer the debug device to the debug controller */ - portsc = readl(&ehci_regs->port_status[debug_port - 1]); - portsc &= ~PORT_PE; - writel(portsc, &ehci_regs->port_status[debug_port - 1]); - - dbgp_mdelay(100); - - /* Find the debug device and make it device number 127 */ - for (devnum = 0; devnum <= 127; devnum++) { - ret = dbgp_control_msg(devnum, - USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, - &dbgp_desc, sizeof(dbgp_desc)); - if (ret > 0) - break; - } - if (devnum > 127) { - dbgp_printk("Could not find attached debug device\n"); - goto err; - } - if (ret < 0) { - 
dbgp_printk("Attached device is not a debug device\n"); - goto err; - } - dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; - - /* Move the device to 127 if it isn't already there */ - if (devnum != USB_DEBUG_DEVNUM) { - ret = dbgp_control_msg(devnum, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); - if (ret < 0) { - dbgp_printk("Could not move attached device to %d\n", - USB_DEBUG_DEVNUM); - goto err; - } - devnum = USB_DEBUG_DEVNUM; - dbgp_printk("debug device renamed to 127\n"); - } - - /* Enable the debug interface */ - ret = dbgp_control_msg(USB_DEBUG_DEVNUM, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); - if (ret < 0) { - dbgp_printk(" Could not enable the debug device\n"); - goto err; - } - dbgp_printk("debug interface enabled\n"); - - /* Perform a small write to get the even/odd data state in sync - */ - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); - if (ret < 0) { - dbgp_printk("dbgp_bulk_write failed: %d\n", ret); - goto err; - } - dbgp_printk("small write doned\n"); - - return 0; -err: - /* Things didn't work so remove my claim */ - ctrl = readl(&ehci_debug->control); - ctrl &= ~(DBGP_CLAIM | DBGP_OUT); - writel(ctrl, &ehci_debug->control); - return -1; - -next_debug_port: - port_map_tried |= (1<<(debug_port - 1)); - new_debug_port = ((debug_port-1+1)%n_ports) + 1; - if (port_map_tried != ((1<<n_ports) - 1)) { - set_debug_port(new_debug_port); - goto try_next_port; - } - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - - return -1; -} - -static int __init early_dbgp_init(char *s) -{ - u32 debug_port, bar, offset; - u32 bus, slot, func, cap; - void __iomem *ehci_bar; - u32 dbgp_num; - u32 bar_val; - char *e; - int ret; - u8 byte; - - if (!early_pci_allowed()) - return -1; - - dbgp_num = 0; - if (*s) - dbgp_num = simple_strtoul(s, &e, 10); - dbgp_printk("dbgp_num: %d\n", 
dbgp_num); - - cap = find_dbgp(dbgp_num, &bus, &slot, &func); - if (!cap) - return -1; - - dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot, - func); - - debug_port = read_pci_config(bus, slot, func, cap); - bar = (debug_port >> 29) & 0x7; - bar = (bar * 4) + 0xc; - offset = (debug_port >> 16) & 0xfff; - dbgp_printk("bar: %02x offset: %03x\n", bar, offset); - if (bar != PCI_BASE_ADDRESS_0) { - dbgp_printk("only debug ports on bar 1 handled.\n"); - - return -1; - } - - bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); - dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); - if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { - dbgp_printk("only simple 32bit mmio bars supported\n"); - - return -1; - } - - /* double check if the mem space is enabled */ - byte = read_pci_config_byte(bus, slot, func, 0x04); - if (!(byte & 0x2)) { - byte |= 0x02; - write_pci_config_byte(bus, slot, func, 0x04, byte); - dbgp_printk("mmio for ehci enabled\n"); - } - - /* - * FIXME I don't have the bar size so just guess PAGE_SIZE is more - * than enough. 1K is the biggest I have seen. 
- */ - set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); - ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); - ehci_bar += bar_val & ~PAGE_MASK; - dbgp_printk("ehci_bar: %p\n", ehci_bar); - - ehci_caps = ehci_bar; - ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); - ehci_debug = ehci_bar + offset; - ehci_dev.bus = bus; - ehci_dev.slot = slot; - ehci_dev.func = func; - - detect_set_debug_port(); - - ret = ehci_setup(); - if (ret < 0) { - dbgp_printk("ehci_setup failed\n"); - ehci_debug = NULL; - - return -1; - } - - return 0; -} - -static void early_dbgp_write(struct console *con, const char *str, u32 n) -{ - int chunk, ret; - - if (!ehci_debug) - return; - while (n > 0) { - chunk = n; - if (chunk > DBGP_MAX_PACKET) - chunk = DBGP_MAX_PACKET; - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, - dbgp_endpoint_out, str, chunk); - str += chunk; - n -= chunk; - } -} - -static struct console early_dbgp_console = { - .name = "earlydbg", - .write = early_dbgp_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; -#endif - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -920,10 +204,24 @@ asmlinkage void early_printk(const char va_end(ap); } +static inline void early_console_register(struct console *con, int keep_early) +{ + if (early_console->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} static int __init setup_early_printk(char *buf) { - int keep_early; + int keep; if (!buf) return 0; @@ -932,44 +230,41 @@ static int __init setup_early_printk(cha return 0; early_console_initialized = 1; - keep_early = (strstr(buf, "keep") != NULL); + keep = (strstr(buf, "keep") != NULL); - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - 
early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3)) { + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + buf += 6; + early_serial_init(buf); + early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } #ifndef CONFIG_XEN - && boot_params.screen_info.orig_video_isVGA == 1) { - max_xpos = boot_params.screen_info.orig_video_cols; - max_ypos = boot_params.screen_info.orig_video_lines; - current_ypos = boot_params.screen_info.orig_y; + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; +#else + if (!strncmp(buf, "vga", 3) || !strncmp(buf, "xen", 3)) { #endif - early_console = &early_vga_console; + early_console_register(&early_vga_console, keep); + } #ifdef CONFIG_EARLY_PRINTK_DBGP - } else if (!strncmp(buf, "dbgp", 4)) { - if (early_dbgp_init(buf+4) < 0) - return 0; - early_console = &early_dbgp_console; - /* - * usb subsys will reset ehci controller, so don't keep - * that early console - */ - keep_early = 0; + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); #endif -#ifdef CONFIG_XEN - } else if (!strncmp(buf, "xen", 3)) { - early_console = &xenboot_console; +#ifdef CONFIG_HVC_XEN + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); #endif + buf++; } - - if (keep_early) - early_console->flags &= ~CON_BOOT; - else - early_console->flags |= CON_BOOT; - register_console(early_console); return 0; } --- sle11sp1-2010-03-29.orig/arch/x86/kernel/entry_64-xen.S 2009-11-06 10:52:09.000000000 
+0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/entry_64-xen.S 2009-11-06 10:52:22.000000000 +0100 @@ -53,6 +53,7 @@ #include <asm/hw_irq.h> #include <asm/page_types.h> #include <asm/irqflags.h> +#include <asm/processor-flags.h> #include <asm/ftrace.h> #include <asm/percpu.h> #include <xen/interface/xen.h> @@ -150,7 +151,7 @@ ENTRY(ftrace_graph_caller) END(ftrace_graph_caller) GLOBAL(return_to_handler) - subq $80, %rsp + subq $24, %rsp /* Save the return values */ movq %rax, (%rsp) @@ -159,10 +160,10 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 72(%rsp) + movq %rax, 16(%rsp) movq 8(%rsp), %rdx movq (%rsp), %rax - addq $72, %rsp + addq $16, %rsp retq #endif @@ -553,20 +554,13 @@ sysret_signal: bt $TIF_SYSCALL_AUDIT,%edx jc sysret_audit #endif - /* edx: work flags (arg3) */ - leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 - xorl %esi,%esi # oldset -> arg2 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call do_notify_resume - RESTORE_TOP_OF_STACK %r11 - RESTORE_REST - movl $_TIF_WORK_MASK,%edi - /* Use IRET because user could have changed frame. This - works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check + /* + * We have a signal, or exit tracing or single-step. + * These all wind up with the iret return path anyway, + * so just join that path right now. 
+ */ + FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + jmp int_check_syscall_exit_work badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) @@ -675,6 +669,7 @@ int_careful: int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) +int_check_syscall_exit_work: SAVE_REST /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx @@ -921,7 +916,7 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif --- sle11sp1-2010-03-29.orig/arch/x86/kernel/head-xen.c 2009-11-06 10:52:02.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/head-xen.c 2009-11-06 10:52:22.000000000 +0100 @@ -59,7 +59,6 @@ void __init reserve_ebda_region(void) #include <asm/fixmap.h> #include <asm/pgtable.h> #include <asm/sections.h> -#include <asm/setup_arch.h> #include <xen/interface/callback.h> #include <xen/interface/memory.h> @@ -163,7 +162,7 @@ void __init xen_start_kernel(void) } -void __init machine_specific_arch_setup(void) +void __init xen_arch_setup(void) { int ret; static const struct callback_register __initconst event = { --- sle11sp1-2010-03-29.orig/arch/x86/kernel/head32-xen.c 2009-11-06 10:52:02.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/head32-xen.c 2009-11-06 10:52:22.000000000 +0100 @@ -9,11 +9,26 @@ #include <linux/start_kernel.h> #include <asm/setup.h> -#include <asm/setup_arch.h> #include <asm/sections.h> #include <asm/e820.h> -#include <asm/bios_ebda.h> +#include <asm/page.h> #include <asm/trampoline.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/bios_ebda.h> + +static void __init i386_default_early_setup(void) +{ + /* Initialize 32bit specific setup functions */ + if (is_initial_xendomain()) + x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; +#ifndef CONFIG_XEN + 
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +#endif +} void __init i386_start_kernel(void) { @@ -31,7 +46,16 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - reserve_ebda_region(); + + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_MRST: + x86_mrst_early_setup(); + break; + default: + i386_default_early_setup(); + break; + } #else { int max_cmdline; @@ -42,6 +66,7 @@ void __init i386_start_kernel(void) boot_command_line[max_cmdline-1] = '\0'; } + i386_default_early_setup(); xen_start_kernel(); #endif --- sle11sp1-2010-03-29.orig/arch/x86/kernel/head64-xen.c 2009-11-06 10:52:02.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/head64-xen.c 2009-11-06 10:52:22.000000000 +0100 @@ -20,15 +20,14 @@ #include <asm/proto.h> #include <asm/smp.h> #include <asm/setup.h> -#include <asm/setup_arch.h> #include <asm/desc.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/e820.h> -#include <asm/bios_ebda.h> #include <asm/trampoline.h> +#include <asm/bios_ebda.h> #ifndef CONFIG_XEN static void __init zap_identity_mappings(void) --- sle11sp1-2010-03-29.orig/arch/x86/kernel/head_32-xen.S 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/head_32-xen.S 2009-11-06 10:52:22.000000000 +0100 @@ -30,7 +30,7 @@ #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id -.section .text.head,"ax",@progbits +__HEAD #define VIRT_ENTRY_OFFSET 0x0 .org VIRT_ENTRY_OFFSET ENTRY(startup_32) @@ -69,7 +69,6 @@ ENTRY(startup_32) */ movl $per_cpu__gdt_page,%eax movl $per_cpu__stack_canary,%ecx - subl $20, %ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) @@ -122,7 +121,7 @@ ENTRY(hypercall_page) /* * BSS section */ 
-.section ".bss.page_aligned","wa" +__PAGE_ALIGNED_BSS .align PAGE_SIZE_asm ENTRY(swapper_pg_fixmap) .fill 1024,4,0 --- sle11sp1-2010-03-29.orig/arch/x86/kernel/head_64-xen.S 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/head_64-xen.S 2009-11-06 10:52:22.000000000 +0100 @@ -23,7 +23,7 @@ #include <asm/percpu.h> #include <xen/interface/elfnote.h> - .section .text.head, "ax", @progbits + __HEAD .code64 .globl startup_64 startup_64: @@ -51,7 +51,7 @@ startup_64: #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ - phys_##name = . - .text.head; \ + phys_##name = . - .head.text; \ ENTRY(name) NEXT_PAGE(init_level4_pgt) @@ -104,7 +104,7 @@ NEXT_PAGE(hypercall_page) #undef NEXT_PAGE - .section .bss.page_aligned, "aw", @nobits + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) .skip PAGE_SIZE --- sle11sp1-2010-03-29.orig/arch/x86/kernel/irq-xen.c 2009-12-18 09:58:56.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/irq-xen.c 2009-12-18 09:59:05.000000000 +0100 @@ -67,10 +67,10 @@ static int show_other_interrupts(struct for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "%*s: ", prec, "CNT"); + seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); @@ -112,7 +112,7 @@ static int show_other_interrupts(struct seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -212,7 +212,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef 
CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif --- sle11sp1-2010-03-29.orig/arch/x86/kernel/irq_32-xen.c 2009-11-06 10:52:02.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/irq_32-xen.c 2009-11-06 10:52:22.000000000 +0100 @@ -218,7 +218,6 @@ bool handle_irq(unsigned irq, struct pt_ void fixup_irqs(void) { unsigned int irq; - static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { @@ -236,8 +235,8 @@ void fixup_irqs(void) } if (desc->chip->set_affinity) desc->chip->set_affinity(irq, affinity); - else if (desc->action && !(warned++)) - printk("Cannot set affinity for irq %i\n", irq); + else if (desc->action) + printk_once("Cannot set affinity for irq %i\n", irq); } #if 0 --- sle11sp1-2010-03-29.orig/arch/x86/kernel/ldt-xen.c 2009-11-06 10:51:55.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/ldt-xen.c 2009-11-06 10:52:22.000000000 +0100 @@ -70,8 +70,8 @@ static int alloc_ldt(mm_context_t *pc, i XENFEAT_writable_descriptor_tables); load_LDT(pc); #ifdef CONFIG_SMP - if (!cpus_equal(current->mm->cpu_vm_mask, - cpumask_of_cpu(smp_processor_id()))) + if (!cpumask_equal(mm_cpumask(current->mm), + cpumask_of(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #endif --- sle11sp1-2010-03-29.orig/arch/x86/kernel/microcode_core-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/microcode_core-xen.c 2009-11-06 10:52:22.000000000 +0100 @@ -97,8 +97,8 @@ static ssize_t microcode_write(struct fi { ssize_t ret = -EINVAL; - if ((len >> PAGE_SHIFT) > num_physpages) { - pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + if ((len >> PAGE_SHIFT) > totalram_pages) { + pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); return ret; } @@ -121,7 +121,7 @@ static const struct file_operations micr 
static struct miscdevice microcode_dev = { .minor = MICROCODE_MINOR, .name = "microcode", - .devnode = "cpu/microcode", + .nodename = "cpu/microcode", .fops = µcode_fops, }; --- sle11sp1-2010-03-29.orig/arch/x86/kernel/mpparse-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/mpparse-xen.c 2009-11-06 10:52:22.000000000 +0100 @@ -51,6 +51,13 @@ static int __init mpf_checksum(unsigned return sum & 0xFF; } +#ifndef CONFIG_XEN +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} +#endif + static void __init MP_processor_info(struct mpc_cpu *m) { #ifndef CONFIG_XEN @@ -62,10 +69,7 @@ static void __init MP_processor_info(str return; } - if (x86_quirks->mpc_apic_id) - apicid = x86_quirks->mpc_apic_id(m); - else - apicid = m->apicid; + apicid = x86_init.mpparse.mpc_apic_id(m); if (m->cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; @@ -80,16 +84,18 @@ static void __init MP_processor_info(str } #ifdef CONFIG_X86_IO_APIC -static void __init MP_bus_info(struct mpc_bus *m) +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) { - char str[7]; memcpy(str, m->bustype, 6); str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} - if (x86_quirks->mpc_oem_bus_info) - x86_quirks->mpc_oem_bus_info(m, str); - else - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; + + x86_init.mpparse.mpc_oem_bus_info(m, str); #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { @@ -106,8 +112,8 @@ static void __init MP_bus_info(struct mp mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { - if (x86_quirks->mpc_oem_pci_bus) - x86_quirks->mpc_oem_pci_bus(m); + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); clear_bit(m->busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) @@ -301,6 +307,8 @@ static void 
__init smp_dump_mptable(stru 1, mpc, mpc->length, 1); } +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -322,16 +330,13 @@ static int __init smp_read_mpc(struct mp if (early) return 1; - if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { - struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; - x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize); - } + if (mpc->oemptr) + x86_init.mpparse.smp_read_mpc_oem(mpc); /* * Now process the configuration blocks. */ - if (x86_quirks->mpc_record) - *x86_quirks->mpc_record = 0; + x86_init.mpparse.mpc_record(0); while (count < mpc->length) { switch (*mpt) { @@ -363,8 +368,7 @@ static int __init smp_read_mpc(struct mp count = mpc->length; break; } - if (x86_quirks->mpc_record) - (*x86_quirks->mpc_record)++; + x86_init.mpparse.mpc_record(1); } #ifdef CONFIG_X86_BIGSMP @@ -492,11 +496,11 @@ static void __init construct_ioapic_tabl MP_bus_info(&bus); } - ioapic.type = MP_IOAPIC; - ioapic.apicid = 2; - ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; - ioapic.flags = MPC_APIC_USABLE; - ioapic.apicaddr = 0xFEC00000; + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; MP_ioapic_info(&ioapic); /* @@ -618,7 +622,7 @@ static int __init check_physptr(struct m /* * Scan the memory blocks for an SMP configuration block. 
*/ -static void __init __get_smp_config(unsigned int early) +void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; @@ -635,11 +639,6 @@ static void __init __get_smp_config(unsi if (acpi_lapic && acpi_ioapic) return; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN) @@ -680,16 +679,6 @@ static void __init __get_smp_config(unsi */ } -void __init early_get_smp_config(void) -{ - __get_smp_config(1); -} - -void __init get_smp_config(void) -{ - __get_smp_config(0); -} - #ifndef CONFIG_XEN static void __init smp_reserve_bootmem(struct mpf_intel *mpf) { @@ -761,16 +750,12 @@ static int __init smp_scan_config(unsign return 0; } -static void __init __find_smp_config(unsigned int reserve) +void __init default_find_smp_config(unsigned int reserve) { #ifndef CONFIG_XEN unsigned int address; #endif - if (x86_quirks->mach_find_smp_config) { - if (x86_quirks->mach_find_smp_config(reserve)) - return; - } /* * FIXME: Linux assumes you have 640K of base ram.. * this continues the error... 
@@ -807,16 +792,6 @@ static void __init __find_smp_config(uns #endif } -void __init early_find_smp_config(void) -{ - __find_smp_config(0); -} - -void __init find_smp_config(void) -{ - __find_smp_config(1); -} - #ifdef CONFIG_X86_IO_APIC static u8 __initdata irq_used[MAX_IRQ_SOURCES]; --- sle11sp1-2010-03-29.orig/arch/x86/kernel/pci-dma-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/pci-dma-xen.c 2009-11-18 14:54:16.000000000 +0100 @@ -3,6 +3,7 @@ #include <linux/dmar.h> #include <linux/bootmem.h> #include <linux/pci.h> +#include <linux/kmemleak.h> #include <asm/proto.h> #include <asm/dma.h> @@ -32,17 +33,22 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -int iommu_pass_through; +/* + * This variable becomes 1 if iommu=pt is passed on the kernel command line. + * If this variable is 1, IOMMU implementations do no DMA translation for + * devices and allow every device to access to whole physical memory. This is + * useful if a user want to use an IOMMU only for KVM device assignment to + * guests and not for driver dma translation. + */ +int iommu_pass_through __read_mostly; dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); -/* Dummy device used for NULL arguments (normally ISA). Better would - be probably a smaller DMA mask, but this is bug-to-bug compatible - to older i386. */ +/* Dummy device used for NULL arguments (normally ISA). 
*/ struct device x86_dma_fallback_dev = { .init_name = "fallback device", - .coherent_dma_mask = DMA_BIT_MASK(32), + .coherent_dma_mask = ISA_DMA_BIT_MASK, .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; EXPORT_SYMBOL(x86_dma_fallback_dev); @@ -88,6 +94,11 @@ void __init dma32_reserve_bootmem(void) size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(dma32_bootmem_ptr); if (dma32_bootmem_ptr) dma32_bootmem_size = size; else @@ -178,7 +189,7 @@ again: #ifndef CONFIG_XEN addr = page_to_phys(page); - if (!is_buffer_dma_capable(dma_mask, addr, size)) { + if (addr + size > dma_mask) { __free_pages(page, order); if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { @@ -266,10 +277,8 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif - if (!strncmp(p, "pt", 2)) { + if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - return 1; - } gart_parse_options(p); @@ -381,7 +390,7 @@ void pci_iommu_shutdown(void) amd_iommu_shutdown(); } /* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); +rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. 
Disable it here */ --- sle11sp1-2010-03-29.orig/arch/x86/kernel/pci-nommu-xen.c 2010-01-27 14:45:15.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/pci-nommu-xen.c 2010-01-27 14:49:11.000000000 +0100 @@ -36,7 +36,7 @@ gnttab_map_sg(struct device *hwdev, stru sg->dma_address = gnttab_dma_map_page(sg_page(sg)) + sg->offset; sg->dma_length = sg->length; - IOMMU_BUG_ON(address_needs_mapping( + IOMMU_BUG_ON(!dma_capable( hwdev, sg->dma_address, sg->length)); IOMMU_BUG_ON(range_straddles_page_boundary( page_to_pseudophys(sg_page(sg)) + sg->offset, @@ -69,7 +69,7 @@ gnttab_map_page(struct device *dev, stru dma = gnttab_dma_map_page(page) + offset; IOMMU_BUG_ON(range_straddles_page_boundary(page_to_pseudophys(page) + offset, size)); - IOMMU_BUG_ON(address_needs_mapping(dev, dma, size)); + IOMMU_BUG_ON(!dma_capable(dev, dma, size)); return dma; } @@ -81,19 +81,36 @@ gnttab_unmap_page(struct device *dev, dm gnttab_dma_unmap_page(dma_addr); } +static void nommu_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + + +static void nommu_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + static int nommu_dma_supported(struct device *hwdev, u64 mask) { return 1; } struct dma_map_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = dma_generic_free_coherent, - .map_page = gnttab_map_page, - .unmap_page = gnttab_unmap_page, - .map_sg = gnttab_map_sg, - .unmap_sg = gnttab_unmap_sg, - .dma_supported = nommu_dma_supported, + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, + .map_page = gnttab_map_page, + .unmap_page = gnttab_unmap_page, + .map_sg = gnttab_map_sg, + .unmap_sg = gnttab_unmap_sg, + .sync_single_for_device = nommu_sync_single_for_device, + .sync_sg_for_device = nommu_sync_sg_for_device, + .dma_supported = 
nommu_dma_supported, }; void __init no_iommu_init(void) --- sle11sp1-2010-03-29.orig/arch/x86/kernel/process-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/process-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -9,7 +9,7 @@ #include <linux/pm.h> #include <linux/clockchips.h> #include <linux/random.h> -#include <trace/power.h> +#include <trace/events/power.h> #include <asm/system.h> #include <asm/apic.h> #include <asm/syscalls.h> @@ -26,9 +26,6 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; -DEFINE_TRACE(power_start); -DEFINE_TRACE(power_end); - int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -285,9 +282,7 @@ static inline int hlt_use_halt(void) */ void xen_idle(void) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -300,7 +295,6 @@ void xen_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; - trace_power_end(&it); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(default_idle); @@ -354,9 +348,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -366,15 +358,13 @@ void mwait_idle_with_hints(unsigned long if (!need_resched()) __mwait(ax, cx); } - trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - struct power_trace it; if (!need_resched()) { - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void 
*)¤t_thread_info()->flags); @@ -384,7 +374,6 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_power_end(&it); } else local_irq_enable(); } @@ -397,13 +386,11 @@ static void mwait_idle(void) */ static void poll_idle(void) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(&it); + trace_power_end(0); } #ifndef CONFIG_XEN @@ -556,10 +543,8 @@ void __init init_c1e_mask(void) { #ifndef CONFIG_XEN /* If we're using c1e_idle, we need to allocate c1e_mask. */ - if (pm_idle == c1e_idle) { - alloc_cpumask_var(&c1e_mask, GFP_KERNEL); - cpumask_clear(c1e_mask); - } + if (pm_idle == c1e_idle) + zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); #endif } --- sle11sp1-2010-03-29.orig/arch/x86/kernel/process_32-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/process_32-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -66,9 +66,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - /* * Return saved PC of a blocked thread. */ @@ -360,6 +357,7 @@ __switch_to(struct task_struct *prev_p, #ifndef CONFIG_X86_NO_TSS struct tss_struct *tss = &per_cpu(init_tss, cpu); #endif + bool preload_fpu; #if CONFIG_XEN_COMPAT > 0x030002 struct physdev_set_iopl iopl_op; struct physdev_set_iobitmap iobmp_op; @@ -373,15 +371,24 @@ __switch_to(struct task_struct *prev_p, /* XEN NOTE: FS/GS saved in switch_mm(), not here. 
*/ /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + + /* * This is basically '__unlazy_fpu', except that we queue a * multicall to indicate FPU task switch, rather than * synchronously trapping to Xen. */ if (task_thread_info(prev_p)->status & TS_USEDFPU) { __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ - mcl->op = __HYPERVISOR_fpu_taskswitch; - mcl->args[0] = 1; - mcl++; + if (!preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; + mcl++; + } } #if 0 /* lazy fpu sanity check */ else BUG_ON(!(read_cr0() & 8)); @@ -427,6 +434,14 @@ __switch_to(struct task_struct *prev_p, mcl++; } + /* If we're going to preload the fpu context, make sure clts + is run while we're batching the cpu state updates. */ + if (preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 0; + mcl++; + } + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { set_xen_guest_handle(iobmp_op.bitmap, (char *)next->io_bitmap_ptr); @@ -451,7 +466,7 @@ __switch_to(struct task_struct *prev_p, BUG(); /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -470,15 +485,8 @@ __switch_to(struct task_struct *prev_p, */ arch_end_context_switch(next_p); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() - */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); /* * Restore %gs if needed (which is common) --- 
sle11sp1-2010-03-29.orig/arch/x86/kernel/process_64-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/process_64-xen.c 2010-03-17 14:37:05.000000000 +0100 @@ -64,9 +64,6 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); @@ -402,6 +399,7 @@ __switch_to(struct task_struct *prev_p, #ifndef CONFIG_X86_NO_TSS struct tss_struct *tss = &per_cpu(init_tss, cpu); #endif + bool preload_fpu; #if CONFIG_XEN_COMPAT > 0x030002 struct physdev_set_iopl iopl_op; struct physdev_set_iobitmap iobmp_op; @@ -412,8 +410,15 @@ __switch_to(struct task_struct *prev_p, #endif multicall_entry_t _mcl[8], *mcl = _mcl; + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -425,12 +430,21 @@ __switch_to(struct task_struct *prev_p, */ if (task_thread_info(prev_p)->status & TS_USEDFPU) { __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ - mcl->op = __HYPERVISOR_fpu_taskswitch; - mcl->args[0] = 1; - mcl++; + if (!preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; + mcl++; + } } else prev_p->fpu_counter = 0; + /* Make sure cpu is ready for new context */ + if (preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 0; + mcl++; + } + /* * Reload sp0. * This is load_sp0(tss, next) with a multicall. 
@@ -550,15 +564,12 @@ __switch_to(struct task_struct *prev_p, task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() + /* + * Preload the FPU context, now that we've determined that the + * task is likely to be using it. */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); return prev_p; } --- sle11sp1-2010-03-29.orig/arch/x86/kernel/quirks-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/quirks-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -509,7 +509,7 @@ static void __init quirk_amd_nb_node(str pci_read_config_dword(nb_ht, 0x60, &val); set_dev_node(&dev->dev, val & 7); - pci_dev_put(dev); + pci_dev_put(nb_ht); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, --- sle11sp1-2010-03-29.orig/arch/x86/kernel/rtc.c 2009-11-06 10:51:25.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/rtc.c 2009-11-06 10:52:23.000000000 +0100 @@ -189,8 +189,10 @@ void read_persistent_clock(struct timesp unsigned long retval, flags; #ifdef CONFIG_XEN - if (!is_initial_xendomain()) - return xen_read_persistent_clock(); + if (!is_initial_xendomain()) { + xen_read_persistent_clock(ts); + return; + } #endif spin_lock_irqsave(&rtc_lock, flags); retval = x86_platform.get_wallclock(); --- sle11sp1-2010-03-29.orig/arch/x86/kernel/setup-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/setup-xen.c 2009-11-18 14:54:16.000000000 +0100 @@ -27,6 +27,7 @@ #include <linux/screen_info.h> #include <linux/ioport.h> #include <linux/acpi.h> +#include <linux/sfi.h> #include <linux/apm_bios.h> #include 
<linux/initrd.h> #include <linux/bootmem.h> @@ -66,6 +67,7 @@ #include <linux/percpu.h> #include <linux/crash_dump.h> +#include <linux/tboot.h> #include <video/edid.h> @@ -138,10 +140,6 @@ start_info_t *xen_start_info; EXPORT_SYMBOL(xen_start_info); #endif -#ifndef ARCH_SETUP -#define ARCH_SETUP -#endif - /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. * The direct mapping extends to max_pfn_mapped, so that we can directly access @@ -164,9 +162,9 @@ int default_cpu_present_to_apicid(int mp return __default_cpu_present_to_apicid(mps_cpu); } -int default_check_phys_apicid_present(int boot_cpu_physical_apicid) +int default_check_phys_apicid_present(int phys_apicid) { - return __default_check_phys_apicid_present(boot_cpu_physical_apicid); + return __default_check_phys_apicid_present(phys_apicid); } #endif @@ -203,13 +201,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; /* common cpu data for all cpus */ @@ -670,7 +661,7 @@ static struct resource standard_io_resou .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -static void __init reserve_standard_io_resources(void) +void __init reserve_standard_io_resources(void) { int i; @@ -706,10 +697,6 @@ static int __init setup_elfcorehdr(char early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -742,6 +729,13 @@ static struct dmi_system_id __initdata b }, }, { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix/MSC BIOS", + .matches = { + 
DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), + }, + }, + { /* * AMI BIOS with low memory corruption was found on Intel DG45ID board. * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will @@ -865,7 +859,7 @@ void __init setup_arch(char **cmdline_p) copy_edid(); #endif /* CONFIG_XEN */ - ARCH_SETUP + x86_init.oem.arch_setup(); setup_memory_map(); parse_setup_data(); @@ -906,6 +900,16 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; +#ifdef CONFIG_X86_64 + /* + * Must call this twice: Once just to detect whether hardware doesn't + * support NX (so that the early EHCI debug console setup can safely + * call set_fixmap(), and then again after parsing early parameters to + * honor the respective command line option. + */ + check_efer(); +#endif + parse_early_param(); #ifdef CONFIG_X86_64 @@ -945,12 +949,9 @@ void __init setup_arch(char **cmdline_p) * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. 
*/ - init_hypervisor(&boot_cpu_data); + init_hypervisor_platform(); -#ifdef CONFIG_X86_32 - if (is_initial_xendomain()) - probe_roms(); -#endif + x86_init.resources.probe_roms(); #ifndef CONFIG_XEN /* after parse_early_param, so could debug it */ @@ -1103,10 +1104,11 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif - xen_pagetable_setup_start(swapper_pg_dir); + x86_init.paging.pagetable_setup_start(swapper_pg_dir); paging_init(); - xen_pagetable_setup_done(swapper_pg_dir); - paravirt_post_allocator_init(); + x86_init.paging.pagetable_setup_done(swapper_pg_dir); + + tboot_probe(); #ifdef CONFIG_X86_64 map_vsyscall(); @@ -1197,13 +1199,13 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); -#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) + sfi_init(); + /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); -#endif prefill_possible_map(); @@ -1227,11 +1229,7 @@ void __init setup_arch(char **cmdline_p) e820_reserve_resources(); #endif -#ifdef CONFIG_X86_32 - if (is_initial_xendomain()) - request_resource(&iomem_resource, &video_ram_resource); -#endif - reserve_standard_io_resources(); + x86_init.resources.reserve_resources(); #ifndef CONFIG_XEN e820_setup_gap(); @@ -1261,80 +1259,25 @@ void __init setup_arch(char **cmdline_p) #endif #endif #endif /* CONFIG_XEN */ + x86_init.oem.banner(); } -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) - -/** - * x86_quirk_intr_init - post gate setup interrupt initialisation - * - * Description: - * Fill in any interrupts that may have been left out by the general - * init_IRQ() routine. interrupts having to do with the machine rather - * than the devices on the I/O bus (like APIC interrupts in intel MP - * systems) are started here. 
- **/ -void __init x86_quirk_intr_init(void) -{ - if (x86_quirks->arch_intr_init) { - if (x86_quirks->arch_intr_init()) - return; - } -} - -/** - * x86_quirk_trap_init - initialise system specific traps - * - * Description: - * Called as the final act of trap_init(). Used in VISWS to initialise - * the various board specific APIC traps. - **/ -void __init x86_quirk_trap_init(void) -{ - if (x86_quirks->arch_trap_init) { - if (x86_quirks->arch_trap_init()) - return; - } -} +#ifdef CONFIG_X86_32 -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; -/** - * x86_quirk_pre_time_init - do any specific initialisations before. - * - **/ -void __init x86_quirk_pre_time_init(void) +void __init i386_reserve_resources(void) { - if (x86_quirks->arch_pre_time_init) - x86_quirks->arch_pre_time_init(); + if (is_initial_xendomain()) + request_resource(&iomem_resource, &video_ram_resource); + reserve_standard_io_resources(); } -/** - * x86_quirk_time_init - do any specific initialisations for the system timer. 
- * - * Description: - * Must plug the system timer interrupt source at HZ into the IRQ listed - * in irq_vectors.h:TIMER_IRQ - **/ -void __init x86_quirk_time_init(void) -{ - if (x86_quirks->arch_time_init) { - /* - * A nonzero return code does not mean failure, it means - * that the architecture quirk does not want any - * generic (timer) setup to be performed after this: - */ - if (x86_quirks->arch_time_init()) - return; - } - - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} #endif /* CONFIG_X86_32 */ #ifdef CONFIG_XEN --- sle11sp1-2010-03-29.orig/arch/x86/kernel/sfi.c 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/arch/x86/kernel/sfi.c 2009-11-06 10:52:23.000000000 +0100 @@ -31,7 +31,7 @@ #include <asm/setup.h> #include <asm/apic.h> -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; void __init mp_sfi_register_lapic_address(unsigned long address) @@ -99,9 +99,12 @@ static int __init sfi_parse_ioapic(struc pentry++; } +#ifndef CONFIG_XEN WARN(pic_mode, KERN_WARNING "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); pic_mode = 0; +#endif + return 0; } #endif /* CONFIG_X86_IO_APIC */ @@ -111,7 +114,7 @@ static int __init sfi_parse_ioapic(struc */ int __init sfi_platform_init(void) { -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) mp_sfi_register_lapic_address(sfi_lapic_addr); sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); #endif --- sle11sp1-2010-03-29.orig/arch/x86/kernel/time-xen.c 2010-02-04 09:42:47.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/time-xen.c 2010-02-09 17:07:46.000000000 +0100 @@ -1,31 +1,12 @@ /* - * Copyright (C) 1991, 1992, 1995 Linus Torvalds + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * 
Copyright (c) 2002,2006 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen * - * This file contains the PC-specific time handling details: - * reading the RTC at bootup, etc.. - * 1994-07-02 Alan Modra - * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26 Markus Kuhn - * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - * precision CMOS clock update - * 1996-05-03 Ingo Molnar - * fixed time warps in do_[slow|fast]_gettimeoffset() - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-09-05 (Various) - * More robust do_fast_gettimeoffset() algorithm implemented - * (works with APM, Cyrix 6x86MX and Centaur C6), - * monotonic gettimeofday() with fast_get_timeoffset(), - * drift-proof precision TSC calibration on boot - * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D. - * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>; - * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>). - * 1998-12-16 Andrea Arcangeli - * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy - * because was not accounting lost_ticks. - * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli - * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). 
*/ #include <linux/init.h> @@ -39,6 +20,7 @@ #include <linux/clocksource.h> #include <linux/sysdev.h> +#include <asm/vsyscall.h> #include <asm/delay.h> #include <asm/time.h> #include <asm/timer.h> @@ -52,7 +34,6 @@ DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); #ifdef CONFIG_X86_64 -#include <asm/vsyscall.h> volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; #endif @@ -415,38 +396,33 @@ unsigned long profile_pc(struct pt_regs { unsigned long pc = instruction_pointer(regs); -#if defined(CONFIG_SMP) || defined(__x86_64__) if (!user_mode_vm(regs) && in_lock_functions(pc)) { -# ifdef CONFIG_FRAME_POINTER +#ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); -# else -# ifdef __i386__ - unsigned long *sp = (unsigned long *)®s->sp; -# else - unsigned long *sp = (unsigned long *)regs->sp; -# endif - - /* Return address is either directly at stack pointer - or above a saved flags. Eflags has bits 22-31 zero, - kernel addresses don't. */ +#else + unsigned long *sp = + (unsigned long *)kernel_stack_pointer(regs); + + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; -# endif - } #endif + } return pc; } EXPORT_SYMBOL(profile_pc); /* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. + * Default timer interrupt handler */ -irqreturn_t timer_interrupt(int irq, void *dev_id) +static irqreturn_t timer_interrupt(int irq, void *dev_id) { s64 delta, delta_cpu, stolen, blocked; unsigned int i, cpu = smp_processor_id(); @@ -566,8 +542,7 @@ irqreturn_t timer_interrupt(int irq, voi /* Local timer processing (see update_process_times()). 
*/ run_local_timers(); - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs())); + rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs())); printk_tick(); scheduler_tick(); run_posix_cpu_timers(current); @@ -667,7 +642,7 @@ static void init_missing_ticks_accountin runstate->time[RUNSTATE_offline]; } -unsigned long xen_read_persistent_clock(void) +void xen_read_persistent_clock(struct timespec *ts) { const shared_info_t *s = HYPERVISOR_shared_info; u32 version, sec, nsec; @@ -684,7 +659,8 @@ unsigned long xen_read_persistent_clock( delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec; do_div(delta, NSEC_PER_SEC); - return delta; + ts->tv_sec = delta; + ts->tv_nsec = 0; } int xen_update_persistent_clock(void) --- sle11sp1-2010-03-29.orig/arch/x86/kernel/traps-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/traps-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -14,7 +14,6 @@ #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/module.h> @@ -59,12 +58,12 @@ #include <asm/mach_traps.h> #ifdef CONFIG_X86_64 +#include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/proto.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> -#include <asm/traps.h> asmlinkage int system_call(void); @@ -74,11 +73,9 @@ char ignore_fpu_irq; #ifndef CONFIG_X86_NO_IDT /* * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. + * F0 0F bug workaround. 
*/ -gate_desc idt_table[256] - __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; #endif #endif @@ -780,27 +777,6 @@ do_spurious_interrupt_bug(struct pt_regs #endif } -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) -{ - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; -} -#endif - asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } @@ -811,6 +787,28 @@ asmlinkage void __attribute__((weak)) sm #endif /* CONFIG_XEN */ /* + * __math_state_restore assumes that cr0.TS is already clear and the + * fpu state is all ready for use. Used during context switch. + */ +void __math_state_restore(void) +{ + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } + + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; +} + +/* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task * @@ -841,17 +839,7 @@ asmlinkage void math_state_restore(void) } /* NB. 'clts' is done for us by Xen during virtual trap. */ - /* - * Paranoid restore. 
send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } - - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; + __math_state_restore(); } EXPORT_SYMBOL_GPL(math_state_restore); @@ -967,6 +955,8 @@ void __init trap_init(void) * Should be a barrier for any external CPU state: */ cpu_init(); + + x86_init.irqs.trap_init(); } void __cpuinit smp_trap_init(trap_info_t *trap_ctxt) --- sle11sp1-2010-03-29.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/kernel/vsyscall_64-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wa vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -227,19 +228,11 @@ static long __vsyscall(3) venosys_1(void } #ifdef CONFIG_SYSCTL - -static int -vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return proc_dointvec(ctl, write, filp, buffer, lenp, ppos); -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .proc_handler = proc_dointvec }, {} }; --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/arch/x86/kernel/x86_init-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de> + * + * For licencing details see kernel-base/COPYING + */ +#include <linux/bitmap.h> +#include <linux/init.h> +#include <linux/threads.h> + +#include <asm/mpspec.h> +#include <asm/setup.h> +#include 
<asm/apic.h> +#include <asm/e820.h> +#include <asm/time.h> +#include <asm/irq.h> + +void __cpuinit x86_init_noop(void) { } +void __init x86_init_uint_noop(unsigned int unused) { } +void __init x86_init_pgd_noop(pgd_t *unused) { } + +/* + * The platform setup functions are preset with the default functions + * for standard PC hardware. + */ +struct x86_init_ops x86_init __initdata = { + + .resources = { + .probe_roms = x86_init_noop, + .reserve_resources = reserve_standard_io_resources, + .memory_setup = default_machine_specific_memory_setup, + }, + + .mpparse = { + .mpc_record = x86_init_uint_noop, + .setup_ioapic_ids = x86_init_noop, + .mpc_apic_id = NULL, + .smp_read_mpc_oem = default_smp_read_mpc_oem, + .mpc_oem_bus_info = default_mpc_oem_bus_info, + .find_smp_config = default_find_smp_config, + .get_smp_config = default_get_smp_config, + }, + + .irqs = { + .pre_vector_init = NULL, + .intr_init = NULL, + .trap_init = x86_init_noop, + }, + + .oem = { + .arch_setup = xen_arch_setup, + .banner = x86_init_noop, + }, + + .paging = { + .pagetable_setup_start = x86_init_pgd_noop, + .pagetable_setup_done = x86_init_pgd_noop, + }, + + .timers = { + .setup_percpu_clockev = NULL, + .tsc_pre_init = x86_init_noop, + .timer_init = x86_init_noop, + }, +}; + +struct x86_platform_ops x86_platform = { + .calibrate_tsc = NULL, + .get_wallclock = mach_get_cmos_time, + .set_wallclock = mach_set_rtc_mmss, +}; --- sle11sp1-2010-03-29.orig/arch/x86/mm/fault-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/mm/fault-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -10,7 +10,7 @@ #include <linux/bootmem.h> /* max_low_pfn */ #include <linux/kprobes.h> /* __kprobes, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ -#include <linux/perf_counter.h> /* perf_swcounter_event */ +#include <linux/perf_event.h> /* perf_sw_event */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... 
*/ @@ -167,6 +167,7 @@ force_sig_info_fault(int si_signo, int s info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; + info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; force_sig_info(si_signo, &info, tsk); } @@ -293,27 +294,25 @@ check_v8086_mode(struct pt_regs *regs, u tsk->thread.screen_bitmap |= 1 << bit; } -static void dump_pagetable(unsigned long address) +static bool low_pfn(unsigned long pfn) { - __typeof__(pte_val(__pte(0))) page; + return pfn < max_low_pfn; +} - page = read_cr3(); - page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(address)]; + pmd_t *pmd; + pte_t *pte; #ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", page); - if ((page & _PAGE_PRESENT) - && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) { - page = mfn_to_pfn(page >> PAGE_SHIFT); - page <<= PAGE_SHIFT; - page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; - printk(KERN_CONT "*pde = %016Lx ", page); - page &= ~_PAGE_NX; - } -#else - printk("*pde = %08lx ", page); + printk("*pdpt = %016Lx ", __pgd_val(*pgd)); + if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) + goto out; #endif + pmd = pmd_offset(pud_offset(pgd, address), address); + printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)__pmd_val(*pmd)); /* * We must not directly access the pte in the highpte @@ -321,17 +320,12 @@ static void dump_pagetable(unsigned long * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ - if ((page & _PAGE_PRESENT) - && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn - && !(page & _PAGE_PSE)) { - - page = mfn_to_pfn(page >> PAGE_SHIFT); - page <<= PAGE_SHIFT; - page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; - printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page); - } + if 
(!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + pte = pte_offset_kernel(pmd, address); + printk(KERN_CONT "*pte = %0*Lx ", sizeof(*pte) * 2, (u64)__pte_val(*pte)); +out: printk(KERN_CONT "\n"); } @@ -460,16 +454,12 @@ static int bad_address(void *p) static void dump_pagetable(unsigned long address) { - pgd_t *pgd; + pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd_t *pgd = base + pgd_index(address); pud_t *pud; pmd_t *pmd; pte_t *pte; - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); - - pgd += pgd_index(address); if (bad_address(pgd)) goto bad; @@ -809,10 +799,12 @@ out_of_memory(struct pt_regs *regs, unsi } static void -do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, + unsigned int fault) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; + int code = BUS_ADRERR; up_read(&mm->mmap_sem); @@ -828,7 +820,15 @@ do_sigbus(struct pt_regs *regs, unsigned tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +#ifdef CONFIG_MEMORY_FAILURE + if (fault & VM_FAULT_HWPOISON) { + printk(KERN_ERR + "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", + tsk->comm, tsk->pid, address); + code = BUS_MCEERR_AR; + } +#endif + force_sig_info_fault(SIGBUS, code, address, tsk); } static noinline void @@ -838,8 +838,8 @@ mm_fault_error(struct pt_regs *regs, uns if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); } else { - if (fault & VM_FAULT_SIGBUS) - do_sigbus(regs, error_code, address); + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) + do_sigbus(regs, error_code, address, fault); else BUG(); } @@ -1053,7 +1053,7 @@ do_page_fault(struct pt_regs *regs, unsi if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - 
perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1150,11 +1150,11 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } --- sle11sp1-2010-03-29.orig/arch/x86/mm/highmem_32-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/mm/highmem_32-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -24,7 +24,7 @@ void kunmap(struct page *page) * no global lock is needed and because the kmap code must perform a global TLB * invalidation when the kmap pool wraps. * - * However when holding an atomic kmap is is not legal to sleep, so atomic + * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) @@ -149,9 +149,7 @@ EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); EXPORT_SYMBOL(kmap_atomic_prot); -#ifdef CONFIG_HIGHPTE EXPORT_SYMBOL(kmap_atomic_to_page); -#endif EXPORT_SYMBOL(clear_highpage); EXPORT_SYMBOL(copy_highpage); --- sle11sp1-2010-03-29.orig/arch/x86/mm/init-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/mm/init-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -36,69 +36,6 @@ extern unsigned long extend_init_mapping extern void xen_finish_init_mapping(void); #endif -int nx_enabled; - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -static int disable_nx __cpuinitdata; - -/* - * noexec = on|off - * - * Control non-executable mappings for processes. 
- * - * on Enable - * off Disable - */ -static int __init noexec_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str, "off", 3)) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", noexec_setup); -#endif - -#ifdef CONFIG_X86_PAE -static void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} -#else -static inline void set_nx(void) -{ -} -#endif - -#ifdef CONFIG_X86_64 -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || disable_nx) - __supported_pte_mask &= ~_PAGE_NX; -} -#endif - static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { --- sle11sp1-2010-03-29.orig/arch/x86/mm/init_32-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/mm/init_32-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -87,7 +87,7 @@ static pmd_t * __init one_md_table_init( #ifdef CONFIG_X86_PAE if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) { if (after_bootmem) - pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); else pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); @@ -124,7 +124,7 @@ static pte_t * __init one_page_table_ini #endif if (!page_table) page_table = - (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); + (pte_t *)alloc_bootmem_pages(PAGE_SIZE); } else page_table = (pte_t *)alloc_low_page(); @@ -914,8 +914,6 @@ static void __init test_wp_bit(void) } } -static struct kcore_list kcore_mem, kcore_vmalloc; - void __init mem_init(void) { 
int codesize, reservedpages, datasize, initsize; @@ -949,13 +947,9 @@ void __init mem_init(void) datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " "%dk reserved, %dk data, %dk init, %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), --- sle11sp1-2010-03-29.orig/arch/x86/mm/init_64-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/mm/init_64-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -894,8 +894,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to #endif /* CONFIG_MEMORY_HOTPLUG */ -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, - kcore_modules, kcore_vsyscall; +static struct kcore_list kcore_vsyscall; void __init mem_init(void) { @@ -931,17 +930,12 @@ void __init mem_init(void) initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - kclist_add(&kcore_kernel, &_stext, _end - _stext); - kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, - VSYSCALL_END - VSYSCALL_START); + VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), max_pfn << (PAGE_SHIFT-10), codesize >> 10, absent_pages << (PAGE_SHIFT-10), --- 
sle11sp1-2010-03-29.orig/arch/x86/mm/iomap_32-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/mm/iomap_32-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -22,7 +22,7 @@ #include <linux/module.h> #include <linux/highmem.h> -int is_io_mapping_possible(resource_size_t base, unsigned long size) +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) /* There is no way to map greater than 1 << 32 address without PAE */ @@ -31,7 +31,30 @@ int is_io_mapping_possible(resource_size #endif return 1; } -EXPORT_SYMBOL_GPL(is_io_mapping_possible); + +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) +{ + unsigned long flag = _PAGE_CACHE_WC; + int ret; + + if (!is_io_mapping_possible(base, size)) + return -EINVAL; + + ret = io_reserve_memtype(base, base + size, &flag); + if (ret) + return ret; + + *prot = __pgprot(__PAGE_KERNEL | flag); + return 0; +} +EXPORT_SYMBOL_GPL(iomap_create_wc); + +void +iomap_free(resource_size_t base, unsigned long size) +{ + io_free_memtype(base, base + size); +} +EXPORT_SYMBOL_GPL(iomap_free); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { --- sle11sp1-2010-03-29.orig/arch/x86/mm/ioremap-xen.c 2009-11-06 10:52:02.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/mm/ioremap-xen.c 2009-11-18 14:56:06.000000000 +0100 @@ -23,81 +23,7 @@ #include <asm/pgalloc.h> #include <asm/pat.h> -static inline int phys_addr_valid(resource_size_t addr) -{ -#ifdef CONFIG_PHYS_ADDR_T_64BIT - return !(addr >> boot_cpu_data.x86_phys_bits); -#else - return 1; -#endif -} - -#ifdef CONFIG_X86_64 - -#define phys_base 0 - -unsigned long __phys_addr(unsigned long x) -{ - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); - x += phys_base; - } else { - VIRTUAL_BUG_ON(x < PAGE_OFFSET); - x -= PAGE_OFFSET; - VIRTUAL_BUG_ON(!phys_addr_valid(x)); - } - return x; 
-} -EXPORT_SYMBOL(__phys_addr); - -bool __virt_addr_valid(unsigned long x) -{ - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - if (x >= KERNEL_IMAGE_SIZE) - return false; - x += phys_base; - } else { - if (x < PAGE_OFFSET) - return false; - x -= PAGE_OFFSET; - if (!phys_addr_valid(x)) - return false; - } - - return pfn_valid(x >> PAGE_SHIFT); -} -EXPORT_SYMBOL(__virt_addr_valid); - -#undef phys_base - -#else - -#ifdef CONFIG_DEBUG_VIRTUAL -unsigned long __phys_addr(unsigned long x) -{ - /* VMALLOC_* aren't constants */ - VIRTUAL_BUG_ON(x < PAGE_OFFSET); - VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); - return x - PAGE_OFFSET; -} -EXPORT_SYMBOL(__phys_addr); -#endif - -bool __virt_addr_valid(unsigned long x) -{ - if (x < PAGE_OFFSET) - return false; - if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) - return false; - if (x >= FIXADDR_START) - return false; - return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); -} -EXPORT_SYMBOL(__virt_addr_valid); - -#endif +#include "physaddr.h" static int direct_remap_area_pte_fn(pte_t *pte, struct page *pmd_page, @@ -407,30 +333,19 @@ static void __iomem *__ioremap_caller(re retval = reserve_memtype(phys_addr, (u64)phys_addr + size, prot_val, &new_prot_val); if (retval) { - pr_debug("Warning: reserve_memtype returned %d\n", retval); + printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); return NULL; } if (prot_val != new_prot_val) { - /* - * Do not fallback to certain memory types with certain - * requested type: - * - request is uc-, return cannot be write-back - * - request is uc-, return cannot be write-combine - * - request is write-combine, return cannot be write-back - */ - if ((prot_val == _PAGE_CACHE_UC_MINUS && - (new_prot_val == _PAGE_CACHE_WB || - new_prot_val == _PAGE_CACHE_WC)) || - (prot_val == _PAGE_CACHE_WC && - new_prot_val == _PAGE_CACHE_WB)) { - pr_debug( + if (!is_new_memtype_allowed(phys_addr, size, + prot_val, new_prot_val)) { + printk(KERN_ERR "ioremap 
error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", (unsigned long long)phys_addr, (unsigned long long)(phys_addr + size), prot_val, new_prot_val); - free_memtype(phys_addr, phys_addr + size); - return NULL; + goto err_free_memtype; } prot_val = new_prot_val; } @@ -456,27 +371,26 @@ static void __iomem *__ioremap_caller(re */ area = get_vm_area_caller(size, VM_IOREMAP, caller); if (!area) - return NULL; + goto err_free_memtype; area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; - if (kernel_map_sync_memtype(phys_addr, size, prot_val)) { - free_memtype(phys_addr, phys_addr + size); - free_vm_area(area); - return NULL; - } + if (kernel_map_sync_memtype(phys_addr, size, prot_val)) + goto err_free_area; if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr), - size, prot, domid)) { - free_memtype(phys_addr, phys_addr + size); - free_vm_area(area); - return NULL; - } + size, prot, domid)) + goto err_free_area; ret_addr = (void __iomem *) (vaddr + offset); mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); return ret_addr; +err_free_area: + free_vm_area(area); +err_free_memtype: + free_memtype(phys_addr, phys_addr + size); + return NULL; } /** --- sle11sp1-2010-03-29.orig/arch/x86/mm/pageattr-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/mm/pageattr-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/pfn.h> +#include <linux/percpu.h> #include <asm/e820.h> #include <asm/processor.h> @@ -143,6 +144,7 @@ void clflush_cache_range(void *vaddr, un mb(); } +EXPORT_SYMBOL_GPL(clflush_cache_range); static void __cpa_flush_all(void *arg) { @@ -707,7 +709,7 @@ static int cpa_process_alias(struct cpa_ { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr, remapped; + unsigned long vaddr; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -765,24 +767,6 @@ static int 
cpa_process_alias(struct cpa_ } #endif - /* - * If the PMD page was partially used for per-cpu remapping, - * the recycled area needs to be split and modified. Because - * the area is always proper subset of a PMD page - * cpa->numpages is guaranteed to be 1 for these areas, so - * there's no need to loop over and check for further remaps. - */ - remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); - if (remapped) { - WARN_ON(cpa->numpages > 1); - alias_cpa = *cpa; - alias_cpa.vaddr = &remapped; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); - if (ret) - return ret; - } - return 0; } @@ -843,6 +827,7 @@ static int change_page_attr_set_clr(unsi { struct cpa_data cpa; int ret, cache, checkalias; + unsigned long baddr = 0; /* * Check, if we are requested to change a not supported @@ -874,6 +859,11 @@ static int change_page_attr_set_clr(unsi */ WARN_ON_ONCE(1); } + /* + * Save address for cache flush. *addr is modified in the call + * to __change_page_attr_set_clr() below. 
+ */ + baddr = *addr; } /* Must avoid aliasing mappings in the highmem code */ @@ -921,7 +911,7 @@ static int change_page_attr_set_clr(unsi cpa_flush_array(addr, numpages, cache, cpa.flags, pages); } else - cpa_flush_range(*addr, numpages, cache); + cpa_flush_range(baddr, numpages, cache); } else cpa_flush_all(cache); --- sle11sp1-2010-03-29.orig/arch/x86/mm/pat-xen.c 2010-02-05 11:17:21.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/mm/pat-xen.c 2010-02-05 11:22:27.000000000 +0100 @@ -15,6 +15,7 @@ #include <linux/gfp.h> #include <linux/mm.h> #include <linux/fs.h> +#include <linux/rbtree.h> #include <asm/cacheflush.h> #include <asm/processor.h> @@ -80,6 +81,7 @@ enum { void pat_init(void) { u64 pat; + bool boot_cpu = !boot_pat_state; if (!pat_enabled) return; @@ -131,8 +133,10 @@ void pat_init(void) if (!boot_pat_state) boot_pat_state = pat; #endif - printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", - smp_processor_id(), boot_pat_state, pat); + + if (boot_cpu) + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", + smp_processor_id(), boot_pat_state, pat); } #undef PAT @@ -160,11 +164,10 @@ static char *cattr_name(unsigned long fl * areas). All the aliases have the same cache attributes of course. * Zero attributes are represented as holes. * - * Currently the data structure is a list because the number of mappings - * are expected to be relatively small. If this should be a problem - * it could be changed to a rbtree or similar. + * The data structure is a list that is also organized as an rbtree + * sorted on the start address of memtype range. * - * memtype_lock protects the whole list. + * memtype_lock protects both the linear list and rbtree. 
*/ struct memtype { @@ -172,11 +175,53 @@ struct memtype { u64 end; unsigned long type; struct list_head nd; + struct rb_node rb; }; +static struct rb_root memtype_rbroot = RB_ROOT; static LIST_HEAD(memtype_list); static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static struct memtype *memtype_rb_search(struct rb_root *root, u64 start) +{ + struct rb_node *node = root->rb_node; + struct memtype *last_lower = NULL; + + while (node) { + struct memtype *data = container_of(node, struct memtype, rb); + + if (data->start < start) { + last_lower = data; + node = node->rb_right; + } else if (data->start > start) { + node = node->rb_left; + } else + return data; + } + + /* Will return NULL if there is no entry with its start <= start */ + return last_lower; +} + +static void memtype_rb_insert(struct rb_root *root, struct memtype *data) +{ + struct rb_node **new = &(root->rb_node); + struct rb_node *parent = NULL; + + while (*new) { + struct memtype *this = container_of(*new, struct memtype, rb); + + parent = *new; + if (data->start <= this->start) + new = &((*new)->rb_left); + else if (data->start > this->start) + new = &((*new)->rb_right); + } + + rb_link_node(&data->rb, parent, new); + rb_insert_color(&data->rb, root); +} + static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end); static inline u8 _mtrr_type_lookup(u64 start, u64 end) { @@ -240,9 +285,6 @@ chk_conflict(struct memtype *new, struct return -EBUSY; } -static struct memtype *cached_entry; -static u64 cached_start; - static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { int ram_page = 0, not_rampage = 0; @@ -271,69 +313,65 @@ static int pat_pagerange_is_ram(resource } /* - * For RAM pages, mark the pages as non WB memory type using - * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or - * set_memory_wc() on a RAM page at a time before marking it as WB again. 
- * This is ok, because only one driver will be owning the page and - * doing set_memory_*() calls. - * - * For now, we use PageNonWB to track that the RAM page is being mapped - * as non WB. In future, we will have to use one more flag - * (or some other mechanism in page_struct) to distinguish between - * UC and WC mapping. + * For RAM pages, we use page flags to mark the pages with appropriate type. + * Here we do two pass: + * - Find the memtype of all the pages in the range, look for any conflicts + * - In case of no conflicts, set the new memtype for pages in the range + * + * Caller must hold memtype_lock for atomicity. */ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, unsigned long *new_type) { struct page *page; - unsigned long mfn, end_mfn; + unsigned long mfn; + + if (req_type == _PAGE_CACHE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_UC_MINUS; + } for (mfn = (start >> PAGE_SHIFT); mfn < (end >> PAGE_SHIFT); ++mfn) { - unsigned long pfn = mfn_to_local_pfn(mfn); + unsigned long type, pfn = mfn_to_local_pfn(mfn); BUG_ON(!pfn_valid(pfn)); page = pfn_to_page(pfn); - if (page_mapped(page) || PageNonWB(page)) - goto out; + type = get_page_memtype(page); + if (type != -1) { + printk(KERN_INFO "reserve_ram_pages_type failed " + "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", + start, end, type, req_type); + if (new_type) + *new_type = type; - SetPageNonWB(page); + return -EBUSY; + } } - return 0; -out: - end_mfn = mfn; - for (mfn = (start >> PAGE_SHIFT); mfn < end_mfn; ++mfn) { + if (new_type) + *new_type = req_type; + + for (mfn = (start >> PAGE_SHIFT); mfn < (end >> PAGE_SHIFT); ++mfn) { page = pfn_to_page(mfn_to_local_pfn(mfn)); - ClearPageNonWB(page); + set_page_memtype(page, req_type); } - - return -EINVAL; + return 0; } static int free_ram_pages_type(u64 start, u64 end) { struct page *page; - unsigned long mfn, end_mfn; + unsigned long mfn; for (mfn = (start >> PAGE_SHIFT); mfn < (end >> 
PAGE_SHIFT); ++mfn) { unsigned long pfn = mfn_to_local_pfn(mfn); BUG_ON(!pfn_valid(pfn)); page = pfn_to_page(pfn); - if (page_mapped(page) || !PageNonWB(page)) - goto out; - - ClearPageNonWB(page); + set_page_memtype(page, -1); } return 0; - -out: - end_mfn = mfn; - for (mfn = (start >> PAGE_SHIFT); mfn < end_mfn; ++mfn) { - page = pfn_to_page(mfn_to_local_pfn(mfn)); - SetPageNonWB(page); - } - return -EINVAL; } /* @@ -367,6 +405,8 @@ int reserve_memtype(u64 start, u64 end, if (new_type) { if (req_type == -1) *new_type = _PAGE_CACHE_WB; + else if (req_type == _PAGE_CACHE_WC) + *new_type = _PAGE_CACHE_UC_MINUS; else *new_type = req_type & _PAGE_CACHE_MASK; } @@ -392,11 +432,16 @@ int reserve_memtype(u64 start, u64 end, *new_type = actual_type; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, - new_type); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = reserve_ram_pages_type(start, end, req_type, new_type); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) @@ -408,17 +453,11 @@ int reserve_memtype(u64 start, u64 end, spin_lock(&memtype_lock); - if (cached_entry && start >= cached_start) - entry = cached_entry; - else - entry = list_entry(&memtype_list, struct memtype, nd); - /* Search for existing mapping that overlaps the current range */ where = NULL; - list_for_each_entry_continue(entry, &memtype_list, nd) { + list_for_each_entry(entry, &memtype_list, nd) { if (end <= entry->start) { where = entry->nd.prev; - cached_entry = list_entry(where, struct memtype, nd); break; } else if (start <= entry->start) { /* end > entry->start */ err = chk_conflict(new, entry, new_type); @@ -426,8 +465,6 @@ int reserve_memtype(u64 start, u64 end, dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); where = entry->nd.prev; - cached_entry = 
list_entry(where, - struct memtype, nd); } break; } else if (start < entry->end) { /* start > entry->start */ @@ -435,8 +472,6 @@ int reserve_memtype(u64 start, u64 end, if (!err) { dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); - cached_entry = list_entry(entry->nd.prev, - struct memtype, nd); /* * Move to right position in the linked @@ -464,13 +499,13 @@ int reserve_memtype(u64 start, u64 end, return err; } - cached_start = start; - if (where) list_add(&new->nd, where); else list_add_tail(&new->nd, &memtype_list); + memtype_rb_insert(&memtype_rbroot, new); + spin_unlock(&memtype_lock); dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", @@ -482,7 +517,7 @@ int reserve_memtype(u64 start, u64 end, int free_memtype(u64 start, u64 end) { - struct memtype *entry; + struct memtype *entry, *saved_entry; int err = -EINVAL; int is_range_ram; @@ -494,23 +529,58 @@ int free_memtype(u64 start, u64 end) return 0; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return free_ram_pages_type(start, end); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = free_ram_pages_type(start, end); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } spin_lock(&memtype_lock); - list_for_each_entry(entry, &memtype_list, nd) { + + entry = memtype_rb_search(&memtype_rbroot, start); + if (unlikely(entry == NULL)) + goto unlock_ret; + + /* + * Saved entry points to an entry with start same or less than what + * we searched for. 
Now go through the list in both directions to look + * for the entry that matches with both start and end, with list stored + * in sorted start address + */ + saved_entry = entry; + list_for_each_entry_from(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { - if (cached_entry == entry || cached_start == start) - cached_entry = NULL; + rb_erase(&entry->rb, &memtype_rbroot); + list_del(&entry->nd); + kfree(entry); + err = 0; + break; + } else if (entry->start > start) { + break; + } + } + + if (!err) + goto unlock_ret; + entry = saved_entry; + list_for_each_entry_reverse(entry, &memtype_list, nd) { + if (entry->start == start && entry->end == end) { + rb_erase(&entry->rb, &memtype_rbroot); list_del(&entry->nd); kfree(entry); err = 0; break; + } else if (entry->start < start) { + break; } } +unlock_ret: spin_unlock(&memtype_lock); if (err) { @@ -524,6 +594,103 @@ int free_memtype(u64 start, u64 end) } +#ifndef CONFIG_XEN +/** + * lookup_memtype - Looksup the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or + * _PAGE_CACHE_UC + */ +static unsigned long lookup_memtype(u64 paddr) +{ + int rettype = _PAGE_CACHE_WB; + struct memtype *entry; + + if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + spin_lock(&memtype_lock); + page = pfn_to_page(paddr >> PAGE_SHIFT); + rettype = get_page_memtype(page); + spin_unlock(&memtype_lock); + /* + * -1 from get_page_memtype() implies RAM page is in its + * default state and not reserved, and hence of type WB + */ + if (rettype == -1) + rettype = _PAGE_CACHE_WB; + + return rettype; + } + + spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = 
_PAGE_CACHE_UC_MINUS; + + spin_unlock(&memtype_lock); + return rettype; +} +#endif + +/** + * io_reserve_memtype - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type) +{ + resource_size_t size = end - start; + unsigned long req_type = *type; + unsigned long new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, size)); + + ret = reserve_memtype(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(start, size, req_type, new_type)) + goto out_free; + + if (kernel_map_sync_memtype(start, size, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + free_memtype(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * io_free_memtype - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void io_free_memtype(resource_size_t start, resource_size_t end) +{ + free_memtype(start, end); +} + pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn, unsigned long size, pgprot_t vma_prot) { @@ -605,9 +772,6 @@ int phys_mem_access_prot_allowed(struct */ int kernel_map_sync_memtype(u64 ma, unsigned long size, unsigned long flags) { - if (!pat_enabled) - return 0; - return ioremap_check_change_attr(ma >> PAGE_SHIFT, size, flags); } @@ -628,11 +792,29 @@ static int reserve_pfn_range(u64 paddr, is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* - * reserve_pfn_range() doesn't support RAM pages. Maintain the current - * behavior with RAM pages by returning success. 
+ * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range. */ - if (is_ram != 0) + if (is_ram) { + if (!pat_enabled) + return 0; + + flags = lookup_memtype(paddr); + if (want_flags != flags) { + printk(KERN_WARNING + "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } return 0; + } ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) @@ -694,14 +876,6 @@ int track_pfn_vma_copy(struct vm_area_st unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (!pat_enabled) - return 0; - - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* * reserve the whole chunk covered by vma. We need the @@ -729,23 +903,24 @@ int track_pfn_vma_copy(struct vm_area_st int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { + unsigned long flags; resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return 0; - - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. 
- */ if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return reserve_pfn_range(paddr, vma_size, prot, 0); } + if (!pat_enabled) + return 0; + + /* for vm_insert_pfn and friends, we set prot based on lookup */ + flags = lookup_memtype(pfn << PAGE_SHIFT); + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + return 0; } @@ -760,14 +935,6 @@ void untrack_pfn_vma(struct vm_area_stru resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return; - - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* free the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; @@ -844,7 +1011,7 @@ static int memtype_seq_show(struct seq_f return 0; } -static struct seq_operations memtype_seq_ops = { +static const struct seq_operations memtype_seq_ops = { .start = memtype_seq_start, .next = memtype_seq_next, .stop = memtype_seq_stop, --- sle11sp1-2010-03-29.orig/arch/x86/mm/pgtable-xen.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/arch/x86/mm/pgtable-xen.c 2009-11-06 10:52:23.000000000 +0100 @@ -692,8 +692,7 @@ int ptep_set_access_flags(struct vm_area if (likely(vma->vm_mm == current->mm)) { if (HYPERVISOR_update_va_mapping(address, entry, - uvm_multi(vma->vm_mm->cpu_vm_mask) | - UVMF_INVLPG)) + uvm_multi(mm_cpumask(vma->vm_mm))|UVMF_INVLPG)) BUG(); } else { xen_l1_entry_update(ptep, entry); --- sle11sp1-2010-03-29.orig/arch/x86/mm/physaddr.c 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/arch/x86/mm/physaddr.c 2009-11-06 10:52:23.000000000 +0100 @@ -8,6 +8,10 @@ #ifdef CONFIG_X86_64 +#ifdef CONFIG_XEN +#define phys_base 0 +#endif + unsigned long __phys_addr(unsigned long x) { if (x >= __START_KERNEL_map) { --- 
sle11sp1-2010-03-29.orig/drivers/acpi/processor_core.c 2010-01-04 12:41:22.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/acpi/processor_core.c 2010-01-04 12:42:24.000000000 +0100 @@ -880,7 +880,7 @@ static int __cpuinit acpi_processor_add( result = processor_extcntl_prepare(pr); if (result) - goto end; + goto err_power_exit; pr->cdev = thermal_cooling_device_register("Processor", device, &processor_cooling_ops); --- sle11sp1-2010-03-29.orig/drivers/char/agp/agp.h 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/drivers/char/agp/agp.h 2009-11-06 10:52:23.000000000 +0100 @@ -31,6 +31,10 @@ #include <asm/agp.h> /* for flush_agp_cache() */ +#ifndef virt_to_gart +#define virt_to_gart virt_to_phys +#endif + #define PFX "agpgart: " //#define AGP_DEBUG 1 --- sle11sp1-2010-03-29.orig/drivers/char/agp/amd-k7-agp.c 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/drivers/char/agp/amd-k7-agp.c 2009-11-06 10:52:23.000000000 +0100 @@ -44,7 +44,7 @@ static int amd_create_page_map(struct am #ifndef CONFIG_X86 SetPageReserved(virt_to_page(page_map->real)); global_cache_flush(); - page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real), + page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), PAGE_SIZE); if (page_map->remapped == NULL) { ClearPageReserved(virt_to_page(page_map->real)); @@ -160,7 +160,7 @@ static int amd_create_gatt_table(struct agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); /* Get the address for the gart region. 
* This is a bus address even on the alpha, b/c its @@ -173,7 +173,7 @@ static int amd_create_gatt_table(struct /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { - writel(virt_to_phys(amd_irongate_private.gatt_pages[i]->real) | 1, + writel(virt_to_gart(amd_irongate_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } --- sle11sp1-2010-03-29.orig/drivers/char/agp/amd64-agp.c 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/drivers/char/agp/amd64-agp.c 2009-11-06 10:52:23.000000000 +0100 @@ -178,7 +178,7 @@ static const struct aper_size_info_32 am static int amd_8151_configure(void) { - unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); + unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real); int i; /* Configure AGP regs in each x86-64 host bridge. */ @@ -558,7 +558,7 @@ static void __devexit agp_amd64_remove(s { struct agp_bridge_data *bridge = pci_get_drvdata(pdev); - release_mem_region(virt_to_phys(bridge->gatt_table_real), + release_mem_region(virt_to_gart(bridge->gatt_table_real), amd64_aperture_sizes[bridge->aperture_size_idx].size); agp_remove_bridge(bridge); agp_put_bridge(bridge); --- sle11sp1-2010-03-29.orig/drivers/char/agp/ati-agp.c 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/drivers/char/agp/ati-agp.c 2009-11-06 10:52:23.000000000 +0100 @@ -360,7 +360,7 @@ static int ati_create_gatt_table(struct agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *) page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); /* Write out the size register */ current_size = A_SIZE_LVL2(agp_bridge->current_size); @@ -390,7 +390,7 @@ static int ati_create_gatt_table(struct /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 
0x00400000) { - writel(virt_to_phys(ati_generic_private.gatt_pages[i]->real) | 1, + writel(virt_to_gart(ati_generic_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } --- sle11sp1-2010-03-29.orig/drivers/char/agp/efficeon-agp.c 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/drivers/char/agp/efficeon-agp.c 2009-11-06 10:52:23.000000000 +0100 @@ -226,7 +226,7 @@ static int efficeon_create_gatt_table(st efficeon_private.l1_table[index] = page; - value = virt_to_phys((unsigned long *)page) | pati | present | index; + value = virt_to_gart((unsigned long *)page) | pati | present | index; pci_write_config_dword(agp_bridge->dev, EFFICEON_ATTPAGE, value); --- sle11sp1-2010-03-29.orig/drivers/char/agp/generic.c 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/drivers/char/agp/generic.c 2009-11-06 10:52:23.000000000 +0100 @@ -988,7 +988,7 @@ int agp_generic_create_gatt_table(struct set_memory_uc((unsigned long)table, 1 << page_order); bridge->gatt_table = (void *)table; #else - bridge->gatt_table = ioremap_nocache(virt_to_phys(table), + bridge->gatt_table = ioremap_nocache(virt_to_gart(table), (PAGE_SIZE * (1 << page_order))); bridge->driver->cache_flush(); #endif @@ -1001,7 +1001,7 @@ int agp_generic_create_gatt_table(struct return -ENOMEM; } - bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real); + bridge->gatt_bus_addr = virt_to_gart(bridge->gatt_table_real); /* AK: bogus, should encode addresses > 4GB */ for (i = 0; i < num_entries; i++) { --- sle11sp1-2010-03-29.orig/drivers/char/agp/sworks-agp.c 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/drivers/char/agp/sworks-agp.c 2009-11-06 10:52:23.000000000 +0100 @@ -155,7 +155,7 @@ static int serverworks_create_gatt_table /* Create a fake scratch directory */ for (i = 0; i < 1024; i++) { writel(agp_bridge->scratch_page, serverworks_private.scratch_dir.remapped+i); - 
writel(virt_to_phys(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); + writel(virt_to_gart(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); } retval = serverworks_create_gatt_pages(value->num_entries / 1024); @@ -167,7 +167,7 @@ static int serverworks_create_gatt_table agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); /* Get the address for the gart region. * This is a bus address even on the alpha, b/c its @@ -179,7 +179,7 @@ static int serverworks_create_gatt_table /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++) - writel(virt_to_phys(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); + writel(virt_to_gart(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); return 0; } --- sle11sp1-2010-03-29.orig/drivers/net/Kconfig 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/net/Kconfig 2009-11-06 11:36:52.000000000 +0100 @@ -3235,7 +3235,7 @@ config VIRTIO_NET config VMXNET3 tristate "VMware VMXNET3 ethernet driver" - depends on PCI && X86 && INET + depends on PCI && X86 && !XEN && INET help This driver supports VMware's vmxnet3 virtual ethernet NIC. To compile this driver as a module, choose M here: the --- sle11sp1-2010-03-29.orig/drivers/pci/msi-xen.c 2009-12-04 11:30:30.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/pci/msi-xen.c 2009-12-04 11:32:09.000000000 +0100 @@ -16,12 +16,11 @@ #include <linux/proc_fs.h> #include <linux/msi.h> #include <linux/smp.h> +#include <linux/errno.h> +#include <linux/io.h> #include <xen/evtchn.h> -#include <asm/errno.h> -#include <asm/io.h> - #include "pci.h" #include "msi.h" @@ -479,7 +478,7 @@ static int msix_capability_init(struct p * to determine if MSI/-X are supported for the device. If MSI/-X is * supported return 0, else return an error code. 
**/ -static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type) +static int pci_msi_check_device(struct pci_dev *dev, int nvec, int type) { struct pci_bus *bus; int ret; @@ -496,8 +495,9 @@ static int pci_msi_check_device(struct p if (nvec < 1) return -ERANGE; - /* Any bridge which does NOT route MSI transactions from it's - * secondary bus to it's primary bus must set NO_MSI flag on + /* + * Any bridge which does NOT route MSI transactions from its + * secondary bus to its primary bus must set NO_MSI flag on * the secondary pci_bus. * We expect only arch-specific PCI host bus controller driver * or quirks for specific PCI bridges to be setting NO_MSI. @@ -615,7 +615,7 @@ void pci_msi_shutdown(struct pci_dev *de dev->msi_enabled = 0; } -void pci_disable_msi(struct pci_dev* dev) +void pci_disable_msi(struct pci_dev *dev) { pci_msi_shutdown(dev); } @@ -655,14 +655,14 @@ int pci_msix_table_size(struct pci_dev * **/ extern int pci_frontend_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec); -int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) +int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) { int status, nr_entries; int i, j, temp; struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); if (!entries) - return -EINVAL; + return -EINVAL; #ifdef CONFIG_XEN_PCIDEV_FRONTEND if (!is_initial_xendomain()) { @@ -737,7 +737,7 @@ int pci_enable_msix(struct pci_dev* dev, EXPORT_SYMBOL(pci_enable_msix); extern void pci_frontend_disable_msix(struct pci_dev* dev); -void pci_msix_shutdown(struct pci_dev* dev) +void pci_msix_shutdown(struct pci_dev *dev) { if (!pci_msi_enable || !dev || !dev->msix_enabled) return; @@ -770,7 +770,8 @@ void pci_msix_shutdown(struct pci_dev* d pci_intx_for_msi(dev, 1); dev->msix_enabled = 0; } -void pci_disable_msix(struct pci_dev* dev) + +void pci_disable_msix(struct pci_dev *dev) { pci_msix_shutdown(dev); } @@ -785,14 +786,14 @@ 
EXPORT_SYMBOL(pci_disable_msix); * allocated for this device function, are reclaimed to unused state, * which may be used later on. **/ -void msi_remove_pci_irq_vectors(struct pci_dev* dev) +void msi_remove_pci_irq_vectors(struct pci_dev *dev) { unsigned long flags; struct msi_dev_list *msi_dev_entry; struct msi_pirq_entry *pirq_entry, *tmp; if (!pci_msi_enable || !dev) - return; + return; msi_dev_entry = get_msi_dev_pirq_list(dev); --- sle11sp1-2010-03-29.orig/drivers/pci/probe.c 2010-03-29 09:08:49.000000000 +0200 +++ sle11sp1-2010-03-29/drivers/pci/probe.c 2010-03-29 09:09:32.000000000 +0200 @@ -1133,7 +1133,11 @@ int pci_scan_slot(struct pci_bus *bus, i if (dev && !dev->is_added) /* new device? */ nr++; +#ifndef pcibios_scan_all_fns if (dev && dev->multifunction) { +#else + if (dev ? dev->multifunction : pcibios_scan_all_fns(bus, devfn)) { +#endif for (fn = 1; fn < 8; fn++) { dev = pci_scan_single_device(bus, devfn + fn); if (dev) { --- sle11sp1-2010-03-29.orig/drivers/sfi/sfi_core.c 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/drivers/sfi/sfi_core.c 2009-12-16 11:53:57.000000000 +0100 @@ -387,6 +387,11 @@ void __init sfi_init(void) if (!acpi_disabled) disable_sfi(); +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + disable_sfi(); +#endif + if (sfi_disabled) return; --- sle11sp1-2010-03-29.orig/drivers/staging/hv/Kconfig 2010-03-31 09:52:27.000000000 +0200 +++ sle11sp1-2010-03-29/drivers/staging/hv/Kconfig 2009-11-06 10:52:23.000000000 +0100 @@ -1,6 +1,6 @@ config HYPERV tristate "Microsoft Hyper-V client drivers" - depends on X86 && m + depends on X86 && !XEN && m default n help Select this option to run Linux as a Hyper-V client operating --- sle11sp1-2010-03-29.orig/drivers/xen/Kconfig 2009-12-18 12:27:52.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/Kconfig 2010-03-26 08:39:39.000000000 +0100 @@ -22,6 +22,7 @@ config XEN_UNPRIVILEGED_GUEST select PM select PM_SLEEP select PM_SLEEP_SMP if SMP + select PM_RUNTIME if PCI select SUSPEND 
config XEN_PRIVCMD --- sle11sp1-2010-03-29.orig/drivers/xen/Makefile 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/Makefile 2009-11-06 10:52:23.000000000 +0100 @@ -10,6 +10,11 @@ obj-$(CONFIG_XEN) += evtchn/ obj-y += xenbus/ obj-$(CONFIG_XEN) += char/ +nostackp := $(call cc-option, -fno-stack-protector) +ifeq ($(CONFIG_PARAVIRT_XEN),y) +CFLAGS_features.o := $(nostackp) +endif + obj-$(CONFIG_XEN) += features.o util.o obj-$(CONFIG_HOTPLUG_CPU) += $(xen-hotplug-y) obj-$(CONFIG_XEN_XENCOMM) += $(xen-xencomm-y) --- sle11sp1-2010-03-29.orig/drivers/xen/balloon/balloon.c 2010-02-02 14:56:12.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/balloon/balloon.c 2010-03-31 09:58:59.000000000 +0200 @@ -77,6 +77,11 @@ static DEFINE_MUTEX(balloon_mutex); */ DEFINE_SPINLOCK(balloon_lock); +#ifndef MODULE +#include <linux/pagevec.h> +static struct pagevec free_pagevec; +#endif + struct balloon_stats balloon_stats; /* We increase/decrease in batches which fit in a page */ @@ -200,14 +205,27 @@ static struct page *balloon_next_page(st static inline void balloon_free_page(struct page *page) { #ifndef MODULE - if (put_page_testzero(page)) - free_cold_page(page); + if (put_page_testzero(page) && !pagevec_add(&free_pagevec, page)) { + __pagevec_free(&free_pagevec); + pagevec_reinit(&free_pagevec); + } #else - /* free_cold_page() is not being exported. */ + /* pagevec interface is not being exported. 
*/ __free_page(page); #endif } +static inline void balloon_free_and_unlock(unsigned long flags) +{ +#ifndef MODULE + if (pagevec_count(&free_pagevec)) { + __pagevec_free(&free_pagevec); + pagevec_reinit(&free_pagevec); + } +#endif + balloon_unlock(flags); +} + static void balloon_alarm(unsigned long unused) { schedule_work(&balloon_worker); @@ -320,7 +338,7 @@ static int increase_reservation(unsigned totalram_pages = bs.current_pages - totalram_bias; out: - balloon_unlock(flags); + balloon_free_and_unlock(flags); #ifndef MODULE setup_per_zone_wmarks(); @@ -559,6 +577,7 @@ static int __init balloon_init(void) IPRINTK("Initialising balloon driver.\n"); #ifdef CONFIG_XEN + pagevec_init(&free_pagevec, true); bs.current_pages = min(xen_start_info->nr_pages, max_pfn); totalram_pages = bs.current_pages; #else @@ -720,8 +739,8 @@ struct page **alloc_empty_pages_and_page } if (ret != 0) { - balloon_unlock(flags); balloon_free_page(page); + balloon_free_and_unlock(flags); goto err; } --- sle11sp1-2010-03-29.orig/drivers/xen/blkfront/vbd.c 2010-01-18 16:54:23.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/blkfront/vbd.c 2010-01-18 16:54:56.000000000 +0100 @@ -105,7 +105,7 @@ static struct xlbd_major_info *major_inf #define XLBD_MAJOR_VBD_ALT(idx) ((idx) ^ XLBD_MAJOR_VBD_START ^ (XLBD_MAJOR_VBD_START + 1)) -static struct block_device_operations xlvbd_block_fops = +static const struct block_device_operations xlvbd_block_fops = { .owner = THIS_MODULE, .open = blkif_open, --- sle11sp1-2010-03-29.orig/drivers/xen/blktap2/device.c 2010-03-01 14:34:58.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/blktap2/device.c 2009-11-06 10:52:23.000000000 +0100 @@ -141,7 +141,7 @@ blktap_device_ioctl(struct block_device return 0; } -static struct block_device_operations blktap_device_file_operations = { +static const struct block_device_operations blktap_device_file_operations = { .owner = THIS_MODULE, .open = blktap_device_open, .release = blktap_device_release, --- 
sle11sp1-2010-03-29.orig/drivers/xen/core/evtchn.c 2010-02-09 17:06:02.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/core/evtchn.c 2009-11-06 11:04:38.000000000 +0100 @@ -138,13 +138,13 @@ static inline unsigned int type_from_irq } /* IRQ <-> VIRQ mapping. */ -DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; +DEFINE_PER_CPU(int[NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping. */ #ifndef NR_IPIS #define NR_IPIS 1 #endif -DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1}; +DEFINE_PER_CPU(int[NR_IPIS], ipi_to_irq) = {[0 ... NR_IPIS-1] = -1}; #ifdef CONFIG_SMP --- sle11sp1-2010-03-29.orig/drivers/xen/core/reboot.c 2009-11-06 10:51:42.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/core/reboot.c 2010-03-15 12:13:22.000000000 +0100 @@ -83,7 +83,7 @@ static int xen_suspend(void *__unused) int err, old_state; daemonize("suspend"); - err = set_cpus_allowed(current, cpumask_of_cpu(0)); + err = set_cpus_allowed_ptr(current, cpumask_of(0)); if (err) { printk(KERN_ERR "Xen suspend can't run on CPU0 (%d)\n", err); goto fail; --- sle11sp1-2010-03-29.orig/drivers/xen/netback/interface.c 2010-01-04 12:40:53.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/netback/interface.c 2010-01-04 12:42:38.000000000 +0100 @@ -159,7 +159,7 @@ static void netbk_get_strings(struct net } } -static struct ethtool_ops network_ethtool_ops = +static const struct ethtool_ops network_ethtool_ops = { .get_drvinfo = netbk_get_drvinfo, --- sle11sp1-2010-03-29.orig/drivers/xen/netback/loopback.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/netback/loopback.c 2009-11-06 10:52:23.000000000 +0100 @@ -134,7 +134,7 @@ static int loopback_start_xmit(struct sk if (!skb_remove_foreign_references(skb)) { np->stats.tx_dropped++; dev_kfree_skb(skb); - return 0; + return NETDEV_TX_OK; } dst_release(skb_dst(skb)); @@ -173,7 +173,7 @@ static int loopback_start_xmit(struct sk netif_rx(skb); - return 0; + return 
NETDEV_TX_OK; } static struct net_device_stats *loopback_get_stats(struct net_device *dev) @@ -182,7 +182,7 @@ static struct net_device_stats *loopback return &np->stats; } -static struct ethtool_ops network_ethtool_ops = +static const struct ethtool_ops network_ethtool_ops = { .get_tx_csum = ethtool_op_get_tx_csum, .set_tx_csum = ethtool_op_set_tx_csum, --- sle11sp1-2010-03-29.orig/drivers/xen/netback/netback.c 2010-01-04 12:40:57.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/netback/netback.c 2009-11-06 10:52:23.000000000 +0100 @@ -340,12 +340,12 @@ int netif_be_start_xmit(struct sk_buff * skb_queue_tail(&rx_queue, skb); tasklet_schedule(&net_rx_tasklet); - return 0; + return NETDEV_TX_OK; drop: netif->stats.tx_dropped++; dev_kfree_skb(skb); - return 0; + return NETDEV_TX_OK; } #if 0 --- sle11sp1-2010-03-29.orig/drivers/xen/netfront/netfront.c 2009-11-06 10:52:09.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/netfront/netfront.c 2009-11-06 10:52:23.000000000 +0100 @@ -953,7 +953,7 @@ static int network_start_xmit(struct sk_ if (np->accel_vif_state.hooks && np->accel_vif_state.hooks->start_xmit(skb, dev)) { /* Fast path has sent this packet */ - return 0; + return NETDEV_TX_OK; } frags += DIV_ROUND_UP(offset + len, PAGE_SIZE); @@ -1042,12 +1042,12 @@ static int network_start_xmit(struct sk_ spin_unlock_irq(&np->tx_lock); - return 0; + return NETDEV_TX_OK; drop: np->stats.tx_dropped++; dev_kfree_skb(skb); - return 0; + return NETDEV_TX_OK; } static irqreturn_t netif_int(int irq, void *dev_id) @@ -1872,7 +1872,7 @@ static void netif_uninit(struct net_devi gnttab_free_grant_references(np->gref_rx_head); } -static struct ethtool_ops network_ethtool_ops = +static const struct ethtool_ops network_ethtool_ops = { .get_tx_csum = ethtool_op_get_tx_csum, .set_tx_csum = ethtool_op_set_tx_csum, --- sle11sp1-2010-03-29.orig/drivers/xen/sfc_netback/accel_fwd.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/accel_fwd.c 2009-11-06 
10:52:23.000000000 +0100 @@ -181,11 +181,10 @@ int netback_accel_fwd_add(const __u8 *ma unsigned long flags; cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac); struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv; - DECLARE_MAC_BUF(buf); BUG_ON(fwd_priv == NULL); - DPRINTK("Adding mac %s\n", print_mac(buf, mac)); + DPRINTK("Adding mac %pM\n", mac); spin_lock_irqsave(&fwd_set->fwd_lock, flags); @@ -200,8 +199,7 @@ int netback_accel_fwd_add(const __u8 *ma if (cuckoo_hash_lookup(&fwd_set->fwd_hash_table, (cuckoo_hash_key *)(&key), &rc) != 0) { spin_unlock_irqrestore(&fwd_set->fwd_lock, flags); - EPRINTK("MAC address %s already accelerated.\n", - print_mac(buf, mac)); + EPRINTK("MAC address %pM already accelerated.\n", mac); return -EEXIST; } @@ -236,9 +234,8 @@ void netback_accel_fwd_remove(const __u8 unsigned long flags; cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac); struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv; - DECLARE_MAC_BUF(buf); - DPRINTK("Removing mac %s\n", print_mac(buf, mac)); + DPRINTK("Removing mac %pM\n", mac); BUG_ON(fwd_priv == NULL); @@ -396,16 +393,14 @@ void netback_accel_tx_packet(struct sk_b if (is_broadcast_ether_addr(skb_mac_header(skb)) && packet_is_arp_reply(skb)) { - DECLARE_MAC_BUF(buf); - /* * update our fast path forwarding to reflect this * gratuitous ARP */ mac = skb_mac_header(skb)+ETH_ALEN; - DPRINTK("%s: found gratuitous ARP for %s\n", - __FUNCTION__, print_mac(buf, mac)); + DPRINTK("%s: found gratuitous ARP for %pM\n", + __FUNCTION__, mac); spin_lock_irqsave(&fwd_set->fwd_lock, flags); /* --- sle11sp1-2010-03-29.orig/drivers/xen/sfc_netback/accel_msg.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/accel_msg.c 2009-11-06 10:52:23.000000000 +0100 @@ -57,11 +57,10 @@ static void netback_accel_msg_tx_localma { unsigned long lock_state; struct net_accel_msg *msg; - DECLARE_MAC_BUF(buf); BUG_ON(bend == NULL || mac == NULL); - VPRINTK("Sending local mac message: %s\n", print_mac(buf, 
mac)); + VPRINTK("Sending local mac message: %pM\n", mac); msg = net_accel_msg_start_send(bend->shared_page, &bend->to_domU, &lock_state); --- sle11sp1-2010-03-29.orig/drivers/xen/sfc_netfront/accel_msg.c 2009-11-06 10:52:02.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_msg.c 2009-11-06 10:52:23.000000000 +0100 @@ -327,10 +327,8 @@ static int vnic_process_localmac_msg(net cuckoo_hash_mac_key key; if (msg->u.localmac.flags & NET_ACCEL_MSG_ADD) { - DECLARE_MAC_BUF(buf); - - DPRINTK("MAC has moved, could be local: %s\n", - print_mac(buf, msg->u.localmac.mac)); + DPRINTK("MAC has moved, could be local: %pM\n", + msg->u.localmac.mac); key = cuckoo_mac_to_key(msg->u.localmac.mac); spin_lock_irqsave(&vnic->table_lock, flags); /* Try to remove it, not a big deal if not there */ --- sle11sp1-2010-03-29.orig/drivers/xen/sfc_netfront/accel_vi.c 2010-01-18 16:19:18.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_vi.c 2010-01-18 16:55:03.000000000 +0100 @@ -643,10 +643,7 @@ netfront_accel_vi_tx_post(netfront_accel (cuckoo_hash_key *)(&key), &value); if (!try_fastpath) { - DECLARE_MAC_BUF(buf); - - VPRINTK("try fast path false for mac: %s\n", - print_mac(buf, skb->data)); + VPRINTK("try fast path false for mac: %pM\n", skb->data); return NETFRONT_ACCEL_STATUS_CANT; } @@ -772,10 +769,9 @@ static void netfront_accel_vi_rx_comple if (compare_ether_addr(skb->data, vnic->mac)) { struct iphdr *ip = (struct iphdr *)(skb->data + ETH_HLEN); u16 port; - DECLARE_MAC_BUF(buf); - DPRINTK("%s: saw wrong MAC address %s\n", - __FUNCTION__, print_mac(buf, skb->data)); + DPRINTK("%s: saw wrong MAC address %pM\n", + __FUNCTION__, skb->data); if (ip->protocol == IPPROTO_TCP) { struct tcphdr *tcp = (struct tcphdr *) --- sle11sp1-2010-03-29.orig/drivers/xen/xenbus/xenbus_dev.c 2009-05-29 10:25:53.000000000 +0200 +++ sle11sp1-2010-03-29/drivers/xen/xenbus/xenbus_dev.c 2009-11-06 10:52:23.000000000 +0100 @@ -36,6 +36,7 @@ #include <linux/errno.h> #include 
<linux/uio.h> #include <linux/notifier.h> +#include <linux/sched.h> #include <linux/wait.h> #include <linux/fs.h> #include <linux/poll.h> --- sle11sp1-2010-03-29.orig/drivers/xen/xenbus/xenbus_probe.c 2009-12-04 11:30:36.000000000 +0100 +++ sle11sp1-2010-03-29/drivers/xen/xenbus/xenbus_probe.c 2009-11-06 10:52:23.000000000 +0100 @@ -42,6 +42,7 @@ #include <linux/ctype.h> #include <linux/fcntl.h> #include <linux/mm.h> +#include <linux/sched.h> #include <linux/proc_fs.h> #include <linux/notifier.h> #include <linux/mutex.h> --- sle11sp1-2010-03-29.orig/lib/swiotlb-xen.c 2009-12-14 17:30:30.000000000 +0100 +++ sle11sp1-2010-03-29/lib/swiotlb-xen.c 2009-12-14 17:36:28.000000000 +0100 @@ -111,79 +111,11 @@ setup_io_tlb_npages(char *str) __setup("swiotlb=", setup_io_tlb_npages); /* make io_tlb_overflow tunable too? */ -void *__init swiotlb_alloc_boot(size_t size, unsigned long nslabs) -{ - void *start = alloc_bootmem_pages(size); - unsigned int i; - int rc; - - dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; - for (i = 0; i < nslabs; i += IO_TLB_SEGSIZE) { - do { - rc = xen_create_contiguous_region( - (unsigned long)start + (i << IO_TLB_SHIFT), - get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT), - dma_bits); - } while (rc && dma_bits++ < max_dma_bits); - if (rc) { - if (i == 0) - panic("No suitable physical memory available for SWIOTLB buffer!\n" - "Use dom0_mem Xen boot parameter to reserve\n" - "some DMA memory (e.g., dom0_mem=-128M).\n"); - io_tlb_nslabs = i; - i <<= IO_TLB_SHIFT; - free_bootmem(__pa(start + i), size - i); - size = i; - for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) { - unsigned int bits = fls64(virt_to_bus(start + i - 1)); - - if (bits > dma_bits) - dma_bits = bits; - } - break; - } - } - - return start; -} - -#ifndef CONFIG_XEN -void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs) -{ - return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); -} -#endif - -dma_addr_t swiotlb_phys_to_bus(struct device 
*hwdev, phys_addr_t paddr) -{ - return phys_to_machine(paddr); -} - -phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) -{ - return machine_to_phys(baddr); -} - +/* Note that this doesn't work with highmem page */ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, volatile void *address) { - return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); -} - -void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address) -{ - return phys_to_virt(swiotlb_bus_to_phys(hwdev, address)); -} - -int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, - dma_addr_t addr, size_t size) -{ - return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); -} - -int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) -{ - return 0; + return phys_to_dma(hwdev, virt_to_phys(address)); } static void swiotlb_print_info(unsigned long bytes) @@ -216,10 +148,35 @@ swiotlb_init_with_default_size(size_t de /* * Get IO TLB memory from the low pages */ - io_tlb_start = swiotlb_alloc_boot(bytes, io_tlb_nslabs); + io_tlb_start = alloc_bootmem_pages(bytes); if (!io_tlb_start) panic("Cannot allocate SWIOTLB buffer!\n"); - bytes = io_tlb_nslabs << IO_TLB_SHIFT; + dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; + for (i = 0; i < io_tlb_nslabs; i += IO_TLB_SEGSIZE) { + do { + rc = xen_create_contiguous_region( + (unsigned long)io_tlb_start + (i << IO_TLB_SHIFT), + get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT), + dma_bits); + } while (rc && dma_bits++ < max_dma_bits); + if (rc) { + if (i == 0) + panic("No suitable physical memory available for SWIOTLB buffer!\n" + "Use dom0_mem Xen boot parameter to reserve\n" + "some DMA memory (e.g., dom0_mem=-128M).\n"); + io_tlb_nslabs = i; + i <<= IO_TLB_SHIFT; + free_bootmem(__pa(io_tlb_start + i), bytes - i); + bytes = i; + for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) { + unsigned int bits = fls64(virt_to_bus(io_tlb_start + i - 1)); + + if (bits > 
dma_bits) + dma_bits = bits; + } + break; + } + } io_tlb_end = io_tlb_start + bytes; /* @@ -283,13 +240,10 @@ static inline int range_needs_mapping(ph static int is_swiotlb_buffer(dma_addr_t addr) { unsigned long pfn = mfn_to_local_pfn(PFN_DOWN(addr)); - char *va = pfn_valid(pfn) ? __va(pfn << PAGE_SHIFT) : NULL; + phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT; -#ifdef CONFIG_HIGHMEM - if (pfn >= highstart_pfn) - return 0; -#endif - return va >= io_tlb_start && va < io_tlb_end; + return paddr >= virt_to_phys(io_tlb_start) && + paddr < virt_to_phys(io_tlb_end); } /* @@ -514,12 +468,15 @@ swiotlb_full(struct device *dev, size_t printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %zu bytes at " "device %s\n", size, dev ? dev_name(dev) : "?"); - if (size > io_tlb_overflow && do_panic) { - if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic("PCI-DMA: Memory would be corrupted\n"); - if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic("PCI-DMA: Random memory would be DMAed\n"); - } + if (size <= io_tlb_overflow || !do_panic) + return; + + if (dir == DMA_BIDIRECTIONAL) + panic("DMA: Random memory could be DMA accessed\n"); + if (dir == DMA_FROM_DEVICE) + panic("DMA: Random memory could be DMA written\n"); + if (dir == DMA_TO_DEVICE) + panic("DMA: Random memory could be DMA read\n"); } /* @@ -545,7 +502,7 @@ dma_addr_t swiotlb_map_page(struct devic * we can safely return the device addr and not worry about bounce * buffering it. 
*/ - if (!address_needs_mapping(dev, dev_addr, size) && + if (dma_capable(dev, dev_addr, size) && !range_needs_mapping(phys, size)) return dev_addr; @@ -575,12 +532,12 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) { - char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) { - do_unmap_single(hwdev, dma_addr, size, dir); + do_unmap_single(hwdev, phys_to_virt(paddr), size, dir); return; } @@ -609,12 +566,12 @@ void swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir) { - char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) - sync_single(hwdev, dma_addr, size, dir); + sync_single(hwdev, phys_to_virt(paddr), size, dir); } EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); @@ -622,12 +579,12 @@ void swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir) { - char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) - sync_single(hwdev, dma_addr, size, dir); + sync_single(hwdev, phys_to_virt(paddr), size, dir); } EXPORT_SYMBOL(swiotlb_sync_single_for_device); @@ -680,8 +637,8 @@ swiotlb_map_sg_attrs(struct device *hwde phys_addr_t paddr = page_to_pseudophys(sg_page(sg)) + sg->offset; - if (range_needs_mapping(paddr, sg->length) - || address_needs_mapping(hwdev, dev_addr, sg->length)) { + if (range_needs_mapping(paddr, sg->length) || + !dma_capable(hwdev, dev_addr, sg->length)) { void *map; gnttab_dma_unmap_page(dev_addr);