1 From: www.kernel.org
2 Subject: Update to 2.6.22
3 Patch-mainline: 2.6.22
4
5 Automatically created from "patches.kernel.org/patch-2.6.22" by xen-port-patches.py
6
7 Acked-by: jbeulich@novell.com
8
9 --- sle11sp1-2010-03-01.orig/arch/x86/Kconfig   2009-11-06 10:46:41.000000000 +0100
10 +++ sle11sp1-2010-03-01/arch/x86/Kconfig        2010-02-09 16:48:41.000000000 +0100
11 @@ -1661,7 +1661,7 @@ config PHYSICAL_START
12  
13  config RELOCATABLE
14         bool "Build a relocatable kernel"
15 -       depends on !X86_XEN
16 +       depends on !XEN
17         default y
18         ---help---
19           This builds a kernel image that retains relocation information
20 @@ -1721,7 +1721,6 @@ config COMPAT_VDSO
21         def_bool y
22         prompt "Compat VDSO support"
23         depends on X86_32 || IA32_EMULATION
24 -       depends on !X86_XEN
25         ---help---
26           Map the 32-bit VDSO to the predictable old-style address too.
27         ---help---
28 @@ -1944,6 +1943,7 @@ config PCI
29         bool "PCI support"
30         default y
31         select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
32 +       select ARCH_SUPPORTS_MSI if (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND)
33         ---help---
34           Find out whether you have a PCI motherboard. PCI is the name of a
35           bus system, i.e. the way the CPU talks to the other stuff inside
36 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/Makefile   2009-11-06 10:46:41.000000000 +0100
37 +++ sle11sp1-2010-03-01/arch/x86/kernel/Makefile        2009-11-06 10:49:47.000000000 +0100
38 @@ -141,4 +141,4 @@ endif
39  disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
40         smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
41  disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o
42 -%/head_$(BITS).o %/head_$(BITS).s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
43 +%/head_64.o %/head_64.s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
44 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/apic_32-xen.c      2009-11-06 10:46:41.000000000 +0100
45 +++ sle11sp1-2010-03-01/arch/x86/kernel/apic_32-xen.c   2009-11-06 10:49:47.000000000 +0100
46 @@ -19,7 +19,6 @@
47  #include <linux/mm.h>
48  #include <linux/delay.h>
49  #include <linux/bootmem.h>
50 -#include <linux/smp_lock.h>
51  #include <linux/interrupt.h>
52  #include <linux/mc146818rtc.h>
53  #include <linux/kernel_stat.h>
54 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/asm-offsets_32.c   2009-11-06 10:46:27.000000000 +0100
55 +++ sle11sp1-2010-03-01/arch/x86/kernel/asm-offsets_32.c        2009-11-06 10:49:47.000000000 +0100
56 @@ -111,11 +111,6 @@ void foo(void)
57  
58         OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
59  
60 -#ifdef CONFIG_XEN
61 -       BLANK();
62 -       OFFSET(XEN_START_mfn_list, start_info, mfn_list);
63 -#endif
64 -
65  #ifdef CONFIG_PARAVIRT
66         BLANK();
67         OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
68 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/cpu/common-xen.c   2009-11-06 10:46:41.000000000 +0100
69 +++ sle11sp1-2010-03-01/arch/x86/kernel/cpu/common-xen.c        2009-11-06 10:49:47.000000000 +0100
70 @@ -22,16 +22,40 @@
71  #define phys_pkg_id(a,b) a
72  #endif
73  #endif
74 -#include <asm/pda.h>
75  #include <asm/hypervisor.h>
76  
77  #include "cpu.h"
78  
79 -DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
80 -EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
81 +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
82 +       [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
83 +       [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
84 +       [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
85 +       [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
86 +#ifndef CONFIG_XEN
87 +       /*
88 +        * Segments used for calling PnP BIOS have byte granularity.
89 +        * They code segments and data segments have fixed 64k limits,
90 +        * the transfer segment sizes are set at run time.
91 +        */
92 +       [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
93 +       [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
94 +       [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
95 +       [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
96 +       [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
97 +       /*
98 +        * The APM segments have byte granularity and their bases
99 +        * are set at run time.  All have 64k limits.
100 +        */
101 +       [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
102 +       /* 16-bit code */
103 +       [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
104 +       [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
105  
106 -struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
107 -EXPORT_SYMBOL(_cpu_pda);
108 +       [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
109 +#endif
110 +       [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
111 +} };
112 +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
113  
114  static int cachesize_override __cpuinitdata = -1;
115  static int disable_x86_fxsr __cpuinitdata;
116 @@ -375,7 +399,7 @@ __setup("serialnumber", x86_serial_nr_se
117  /*
118   * This does the hard work of actually picking apart the CPU stuff...
119   */
120 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
121 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
122  {
123         int i;
124  
125 @@ -486,15 +510,22 @@ void __cpuinit identify_cpu(struct cpuin
126  
127         /* Init Machine Check Exception if available. */
128         mcheck_init(c);
129 +}
130  
131 -       if (c == &boot_cpu_data)
132 -               sysenter_setup();
133 +void __init identify_boot_cpu(void)
134 +{
135 +       identify_cpu(&boot_cpu_data);
136 +       sysenter_setup();
137         enable_sep_cpu();
138 +       mtrr_bp_init();
139 +}
140  
141 -       if (c == &boot_cpu_data)
142 -               mtrr_bp_init();
143 -       else
144 -               mtrr_ap_init();
145 +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
146 +{
147 +       BUG_ON(c == &boot_cpu_data);
148 +       identify_cpu(c);
149 +       enable_sep_cpu();
150 +       mtrr_ap_init();
151  }
152  
153  #ifdef CONFIG_X86_HT
154 @@ -608,136 +639,47 @@ void __init early_cpu_init(void)
155  #endif
156  }
157  
158 -/* Make sure %gs is initialized properly in idle threads */
159 +/* Make sure %fs is initialized properly in idle threads */
160  struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
161  {
162         memset(regs, 0, sizeof(struct pt_regs));
163 -       regs->xfs = __KERNEL_PDA;
164 +       regs->xfs = __KERNEL_PERCPU;
165         return regs;
166  }
167  
168 -static __cpuinit int alloc_gdt(int cpu)
169 -{
170 -       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
171 -       struct desc_struct *gdt;
172 -       struct i386_pda *pda;
173 -
174 -       gdt = (struct desc_struct *)cpu_gdt_descr->address;
175 -       pda = cpu_pda(cpu);
176 -
177 -       /*
178 -        * This is a horrible hack to allocate the GDT.  The problem
179 -        * is that cpu_init() is called really early for the boot CPU
180 -        * (and hence needs bootmem) but much later for the secondary
181 -        * CPUs, when bootmem will have gone away
182 -        */
183 -       if (NODE_DATA(0)->bdata->node_bootmem_map) {
184 -               BUG_ON(gdt != NULL || pda != NULL);
185 -
186 -               gdt = alloc_bootmem_pages(PAGE_SIZE);
187 -               pda = alloc_bootmem(sizeof(*pda));
188 -               /* alloc_bootmem(_pages) panics on failure, so no check */
189 -
190 -               memset(gdt, 0, PAGE_SIZE);
191 -               memset(pda, 0, sizeof(*pda));
192 -       } else {
193 -               /* GDT and PDA might already have been allocated if
194 -                  this is a CPU hotplug re-insertion. */
195 -               if (gdt == NULL)
196 -                       gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
197 -
198 -               if (pda == NULL)
199 -                       pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
200 -
201 -               if (unlikely(!gdt || !pda)) {
202 -                       free_pages((unsigned long)gdt, 0);
203 -                       kfree(pda);
204 -                       return 0;
205 -               }
206 -       }
207 -
208 -       cpu_gdt_descr->address = (unsigned long)gdt;
209 -       cpu_pda(cpu) = pda;
210 -
211 -       return 1;
212 -}
213 -
214 -/* Initial PDA used by boot CPU */
215 -struct i386_pda boot_pda = {
216 -       ._pda = &boot_pda,
217 -       .cpu_number = 0,
218 -       .pcurrent = &init_task,
219 -};
220 -
221 -static inline void set_kernel_fs(void)
222 -{
223 -       /* Set %fs for this CPU's PDA.  Memory clobber is to create a
224 -          barrier with respect to any PDA operations, so the compiler
225 -          doesn't move any before here. */
226 -       asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
227 -}
228 -
229 -/* Initialize the CPU's GDT and PDA.  The boot CPU does this for
230 -   itself, but secondaries find this done for them. */
231 -__cpuinit int init_gdt(int cpu, struct task_struct *idle)
232 -{
233 -       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
234 -       struct desc_struct *gdt;
235 -       struct i386_pda *pda;
236 -
237 -       /* For non-boot CPUs, the GDT and PDA should already have been
238 -          allocated. */
239 -       if (!alloc_gdt(cpu)) {
240 -               printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
241 -               return 0;
242 -       }
243 -
244 -       gdt = (struct desc_struct *)cpu_gdt_descr->address;
245 -       pda = cpu_pda(cpu);
246 -
247 -       BUG_ON(gdt == NULL || pda == NULL);
248 -
249 -       /*
250 -        * Initialize the per-CPU GDT with the boot GDT,
251 -        * and set up the GDT descriptor:
252 -        */
253 -       memcpy(gdt, cpu_gdt_table, GDT_SIZE);
254 -       cpu_gdt_descr->size = GDT_SIZE - 1;
255 -
256 -       pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
257 -                       (u32 *)&gdt[GDT_ENTRY_PDA].b,
258 -                       (unsigned long)pda, sizeof(*pda) - 1,
259 -                       0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
260 -
261 -       memset(pda, 0, sizeof(*pda));
262 -       pda->_pda = pda;
263 -       pda->cpu_number = cpu;
264 -       pda->pcurrent = idle;
265 -
266 -       return 1;
267 -}
268 -
269 -void __cpuinit cpu_set_gdt(int cpu)
270 +/* Current gdt points %fs at the "master" per-cpu area: after this,
271 + * it's on the real one. */
272 +void switch_to_new_gdt(void)
273  {
274 -       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
275 +       struct Xgt_desc_struct gdt_descr;
276         unsigned long va, frames[16];
277         int f;
278  
279 -       for (va = cpu_gdt_descr->address, f = 0;
280 -            va < cpu_gdt_descr->address + cpu_gdt_descr->size;
281 +       gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
282 +       gdt_descr.size = GDT_SIZE - 1;
283 +
284 +       for (va = gdt_descr.address, f = 0;
285 +            va < gdt_descr.address + gdt_descr.size;
286              va += PAGE_SIZE, f++) {
287                 frames[f] = virt_to_mfn(va);
288                 make_lowmem_page_readonly(
289                         (void *)va, XENFEAT_writable_descriptor_tables);
290         }
291 -       BUG_ON(HYPERVISOR_set_gdt(frames, (cpu_gdt_descr->size + 1) / 8));
292 -
293 -       set_kernel_fs();
294 +       if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8))
295 +               BUG();
296 +       asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
297  }
298  
299 -/* Common CPU init for both boot and secondary CPUs */
300 -static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
301 +/*
302 + * cpu_init() initializes state that is per-CPU. Some data is already
303 + * initialized (naturally) in the bootstrap process, such as the GDT
304 + * and IDT. We reload them nevertheless, this function acts as a
305 + * 'CPU state barrier', nothing should get across.
306 + */
307 +void __cpuinit cpu_init(void)
308  {
309 +       int cpu = smp_processor_id();
310 +       struct task_struct *curr = current;
311  #ifndef CONFIG_X86_NO_TSS
312         struct tss_struct * t = &per_cpu(init_tss, cpu);
313  #endif
314 @@ -759,6 +701,8 @@ static void __cpuinit _cpu_init(int cpu,
315                 set_in_cr4(X86_CR4_TSD);
316         }
317  
318 +       switch_to_new_gdt();
319 +
320         /*
321          * Set up and load the per-CPU TSS and LDT
322          */
323 @@ -796,38 +740,6 @@ static void __cpuinit _cpu_init(int cpu,
324         mxcsr_feature_mask_init();
325  }
326  
327 -/* Entrypoint to initialize secondary CPU */
328 -void __cpuinit secondary_cpu_init(void)
329 -{
330 -       int cpu = smp_processor_id();
331 -       struct task_struct *curr = current;
332 -
333 -       _cpu_init(cpu, curr);
334 -}
335 -
336 -/*
337 - * cpu_init() initializes state that is per-CPU. Some data is already
338 - * initialized (naturally) in the bootstrap process, such as the GDT
339 - * and IDT. We reload them nevertheless, this function acts as a
340 - * 'CPU state barrier', nothing should get across.
341 - */
342 -void __cpuinit cpu_init(void)
343 -{
344 -       int cpu = smp_processor_id();
345 -       struct task_struct *curr = current;
346 -
347 -       /* Set up the real GDT and PDA, so we can transition from the
348 -          boot versions. */
349 -       if (!init_gdt(cpu, curr)) {
350 -               /* failed to allocate something; not much we can do... */
351 -               for (;;)
352 -                       local_irq_enable();
353 -       }
354 -
355 -       cpu_set_gdt(cpu);
356 -       _cpu_init(cpu, curr);
357 -}
358 -
359  #ifdef CONFIG_HOTPLUG_CPU
360  void __cpuinit cpu_uninit(void)
361  {
362 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/cpu/mtrr/main-xen.c        2009-11-06 10:46:27.000000000 +0100
363 +++ sle11sp1-2010-03-01/arch/x86/kernel/cpu/mtrr/main-xen.c     2009-11-06 10:49:47.000000000 +0100
364 @@ -167,7 +167,7 @@ mtrr_del(int reg, unsigned long base, un
365  EXPORT_SYMBOL(mtrr_add);
366  EXPORT_SYMBOL(mtrr_del);
367  
368 -void __init mtrr_bp_init(void)
369 +__init void mtrr_bp_init(void)
370  {
371  }
372  
373 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/e820_32-xen.c      2009-11-06 10:46:41.000000000 +0100
374 +++ sle11sp1-2010-03-01/arch/x86/kernel/e820_32-xen.c   2009-11-06 10:49:47.000000000 +0100
375 @@ -162,26 +162,27 @@ static struct resource standard_io_resou
376  
377  static int __init romsignature(const unsigned char *rom)
378  {
379 +       const unsigned short * const ptr = (const unsigned short *)rom;
380         unsigned short sig;
381  
382 -       return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
383 -              sig == ROMSIGNATURE;
384 +       return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
385  }
386  
387 -static int __init romchecksum(unsigned char *rom, unsigned long length)
388 +static int __init romchecksum(const unsigned char *rom, unsigned long length)
389  {
390 -       unsigned char sum;
391 +       unsigned char sum, c;
392  
393 -       for (sum = 0; length; length--)
394 -               sum += *rom++;
395 -       return sum == 0;
396 +       for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
397 +               sum += c;
398 +       return !length && !sum;
399  }
400  
401  static void __init probe_roms(void)
402  {
403 +       const unsigned char *rom;
404         unsigned long start, length, upper;
405 -       unsigned char *rom;
406 -       int           i;
407 +       unsigned char c;
408 +       int i;
409  
410  #ifdef CONFIG_XEN
411         /* Nothing to do if not running in dom0. */
412 @@ -198,8 +199,11 @@ static void __init probe_roms(void)
413  
414                 video_rom_resource.start = start;
415  
416 +               if (probe_kernel_address(rom + 2, c) != 0)
417 +                       continue;
418 +
419                 /* 0 < length <= 0x7f * 512, historically */
420 -               length = rom[2] * 512;
421 +               length = c * 512;
422  
423                 /* if checksum okay, trust length byte */
424                 if (length && romchecksum(rom, length))
425 @@ -233,8 +237,11 @@ static void __init probe_roms(void)
426                 if (!romsignature(rom))
427                         continue;
428  
429 +               if (probe_kernel_address(rom + 2, c) != 0)
430 +                       continue;
431 +
432                 /* 0 < length <= 0x7f * 512, historically */
433 -               length = rom[2] * 512;
434 +               length = c * 512;
435  
436                 /* but accept any length that fits if checksum okay */
437                 if (!length || start + length > upper || !romchecksum(rom, length))
438 @@ -249,7 +256,7 @@ static void __init probe_roms(void)
439  }
440  
441  #ifdef CONFIG_XEN
442 -static struct e820map machine_e820 __initdata;
443 +static struct e820map machine_e820;
444  #define e820 machine_e820
445  #endif
446  
447 @@ -409,10 +416,8 @@ int __init sanitize_e820_map(struct e820
448                    ____________________33__
449                    ______________________4_
450         */
451 -       printk("sanitize start\n");
452         /* if there's only one memory region, don't bother */
453         if (*pnr_map < 2) {
454 -               printk("sanitize bail 0\n");
455                 return -1;
456         }
457  
458 @@ -421,7 +426,6 @@ int __init sanitize_e820_map(struct e820
459         /* bail out if we find any unreasonable addresses in bios map */
460         for (i=0; i<old_nr; i++)
461                 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
462 -                       printk("sanitize bail 1\n");
463                         return -1;
464                 }
465  
466 @@ -517,7 +521,6 @@ int __init sanitize_e820_map(struct e820
467         memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
468         *pnr_map = new_nr;
469  
470 -       printk("sanitize end\n");
471         return 0;
472  }
473  
474 @@ -552,7 +555,6 @@ int __init copy_e820_map(struct e820entr
475                 unsigned long long size = biosmap->size;
476                 unsigned long long end = start + size;
477                 unsigned long type = biosmap->type;
478 -               printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
479  
480                 /* Overflow in 64 bits? Ignore the memory map. */
481                 if (start > end)
482 @@ -564,17 +566,11 @@ int __init copy_e820_map(struct e820entr
483                  * Not right. Fix it up.
484                  */
485                 if (type == E820_RAM) {
486 -                       printk("copy_e820_map() type is E820_RAM\n");
487                         if (start < 0x100000ULL && end > 0xA0000ULL) {
488 -                               printk("copy_e820_map() lies in range...\n");
489 -                               if (start < 0xA0000ULL) {
490 -                                       printk("copy_e820_map() start < 0xA0000ULL\n");
491 +                               if (start < 0xA0000ULL)
492                                         add_memory_region(start, 0xA0000ULL-start, type);
493 -                               }
494 -                               if (end <= 0x100000ULL) {
495 -                                       printk("copy_e820_map() end <= 0x100000ULL\n");
496 +                               if (end <= 0x100000ULL)
497                                         continue;
498 -                               }
499                                 start = 0x100000ULL;
500                                 size = end - start;
501                         }
502 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/entry_32-xen.S     2009-11-06 10:46:41.000000000 +0100
503 +++ sle11sp1-2010-03-01/arch/x86/kernel/entry_32-xen.S  2009-11-06 10:49:47.000000000 +0100
504 @@ -15,7 +15,7 @@
505   * I changed all the .align's to 4 (16 byte alignment), as that's faster
506   * on a 486.
507   *
508 - * Stack layout in 'ret_from_system_call':
509 + * Stack layout in 'syscall_exit':
510   *     ptrace needs to have all regs on the stack.
511   *     if the order here is changed, it needs to be
512   *     updated in fork.c:copy_process, signal.c:do_signal,
513 @@ -135,7 +135,7 @@ NMI_MASK    = 0x80000000
514         movl $(__USER_DS), %edx; \
515         movl %edx, %ds; \
516         movl %edx, %es; \
517 -       movl $(__KERNEL_PDA), %edx; \
518 +       movl $(__KERNEL_PERCPU), %edx; \
519         movl %edx, %fs
520  
521  #define RESTORE_INT_REGS \
522 @@ -308,16 +308,12 @@ sysenter_past_esp:
523         pushl $(__USER_CS)
524         CFI_ADJUST_CFA_OFFSET 4
525         /*CFI_REL_OFFSET cs, 0*/
526 -#ifndef CONFIG_COMPAT_VDSO
527         /*
528          * Push current_thread_info()->sysenter_return to the stack.
529          * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
530          * pushed above; +8 corresponds to copy_thread's esp0 setting.
531          */
532         pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
533 -#else
534 -       pushl $SYSENTER_RETURN
535 -#endif
536         CFI_ADJUST_CFA_OFFSET 4
537         CFI_REL_OFFSET eip, 0
538  
539 @@ -345,7 +341,7 @@ sysenter_past_esp:
540         jae syscall_badsys
541         call *sys_call_table(,%eax,4)
542         movl %eax,PT_EAX(%esp)
543 -       DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
544 +       DISABLE_INTERRUPTS(CLBR_ANY)
545         TRACE_IRQS_OFF
546         movl TI_flags(%ebp), %ecx
547         testw $_TIF_ALLWORK_MASK, %cx
548 @@ -400,10 +396,6 @@ ENTRY(system_call)
549         CFI_ADJUST_CFA_OFFSET 4
550         SAVE_ALL
551         GET_THREAD_INFO(%ebp)
552 -       testl $TF_MASK,PT_EFLAGS(%esp)
553 -       jz no_singlestep
554 -       orl $_TIF_SINGLESTEP,TI_flags(%ebp)
555 -no_singlestep:
556                                         # system call tracing in operation / emulation
557         /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
558         testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
559 @@ -418,6 +410,10 @@ syscall_exit:
560                                         # setting need_resched or sigpending
561                                         # between sampling and the iret
562         TRACE_IRQS_OFF
563 +       testl $TF_MASK,PT_EFLAGS(%esp)  # If tracing set singlestep flag on exit
564 +       jz no_singlestep
565 +       orl $_TIF_SINGLESTEP,TI_flags(%ebp)
566 +no_singlestep:
567         movl TI_flags(%ebp), %ecx
568         testw $_TIF_ALLWORK_MASK, %cx   # current->work
569         jne syscall_exit_work
570 @@ -635,9 +631,7 @@ END(syscall_badsys)
571  #ifndef CONFIG_XEN
572  #define FIXUP_ESPFIX_STACK \
573         /* since we are on a wrong stack, we cant make it a C code :( */ \
574 -       movl %fs:PDA_cpu, %ebx; \
575 -       PER_CPU(cpu_gdt_descr, %ebx); \
576 -       movl GDS_address(%ebx), %ebx; \
577 +       PER_CPU(gdt_page, %ebx); \
578         GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
579         addl %esp, %eax; \
580         pushl $__KERNEL_DS; \
581 @@ -710,7 +704,7 @@ ENTRY(name)                         \
582         SAVE_ALL;                       \
583         TRACE_IRQS_OFF                  \
584         movl %esp,%eax;                 \
585 -       call smp_/**/name;              \
586 +       call smp_##name;                \
587         jmp ret_from_intr;              \
588         CFI_ENDPROC;                    \
589  ENDPROC(name)
590 @@ -718,10 +712,6 @@ ENDPROC(name)
591  /* The include is where all of the SMP etc. interrupts come from */
592  #include "entry_arch.h"
593  
594 -/* This alternate entry is needed because we hijack the apic LVTT */
595 -#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
596 -BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
597 -#endif
598  #else
599  #define UNWIND_ESPFIX_STACK
600  #endif
601 @@ -764,7 +754,7 @@ error_code:
602         pushl %fs
603         CFI_ADJUST_CFA_OFFSET 4
604         /*CFI_REL_OFFSET fs, 0*/
605 -       movl $(__KERNEL_PDA), %ecx
606 +       movl $(__KERNEL_PERCPU), %ecx
607         movl %ecx, %fs
608         UNWIND_ESPFIX_STACK
609         popl %ecx
610 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/head_32-xen.S      2009-11-06 10:46:41.000000000 +0100
611 +++ sle11sp1-2010-03-01/arch/x86/kernel/head_32-xen.S   2009-11-06 10:49:47.000000000 +0100
612 @@ -37,7 +37,8 @@ ENTRY(startup_32)
613         /* Set up the stack pointer */
614         movl $(init_thread_union+THREAD_SIZE),%esp
615  
616 -       call setup_pda
617 +       movl %ss,%eax
618 +       movl %eax,%fs                   # gets reset once there's real percpu
619  
620         /* get vendor info */
621         xorl %eax,%eax                  # call CPUID with 0 -> return vendor ID
622 @@ -64,55 +65,11 @@ ENTRY(startup_32)
623         xorl %eax,%eax          # Clear GS
624         movl %eax,%gs
625  
626 -       movl $(__KERNEL_PDA),%eax
627 -       mov  %eax,%fs
628 -
629         cld                     # gcc2 wants the direction flag cleared at all times
630  
631         pushl $0                # fake return address for unwinder
632         jmp start_kernel
633  
634 -/*
635 - * Point the GDT at this CPU's PDA.  This will be
636 - * cpu_gdt_table and boot_pda.
637 - */
638 -ENTRY(setup_pda)
639 -       /* get the PDA pointer */
640 -       movl $boot_pda, %eax
641 -
642 -       /* slot the PDA address into the GDT */
643 -       mov $cpu_gdt_table, %ecx
644 -       mov %ax, (__KERNEL_PDA+0+2)(%ecx)               /* base & 0x0000ffff */
645 -       shr $16, %eax
646 -       mov %al, (__KERNEL_PDA+4+0)(%ecx)               /* base & 0x00ff0000 */
647 -       mov %ah, (__KERNEL_PDA+4+3)(%ecx)               /* base & 0xff000000 */
648 -
649 -       # %esi still points to start_info, and no registers
650 -       # need to be preserved.
651 -
652 -       movl XEN_START_mfn_list(%esi), %ebx
653 -       movl $(cpu_gdt_table - __PAGE_OFFSET), %eax
654 -       shrl $PAGE_SHIFT, %eax
655 -       movl (%ebx,%eax,4), %ecx
656 -       pushl %ecx                      # frame number for set_gdt below
657 -
658 -       xorl %esi, %esi
659 -       xorl %edx, %edx
660 -       shldl $PAGE_SHIFT, %ecx, %edx
661 -       shll $PAGE_SHIFT, %ecx
662 -       orl $0x61, %ecx
663 -       movl $cpu_gdt_table, %ebx
664 -       movl $__HYPERVISOR_update_va_mapping, %eax
665 -       int $0x82
666 -
667 -       movl $(PAGE_SIZE_asm / 8), %ecx
668 -       movl %esp, %ebx
669 -       movl $__HYPERVISOR_set_gdt, %eax
670 -       int $0x82
671 -
672 -       popl %ecx
673 -       ret
674 -
675  #define HYPERCALL_PAGE_OFFSET 0x1000
676  .org HYPERCALL_PAGE_OFFSET
677  ENTRY(hypercall_page)
678 @@ -138,60 +95,6 @@ ENTRY(empty_zero_page)
679   */
680  .data
681  
682 -/*
683 - * The Global Descriptor Table contains 28 quadwords, per-CPU.
684 - */
685 -       .section .data.page_aligned, "aw"
686 -       .align PAGE_SIZE_asm
687 -ENTRY(cpu_gdt_table)
688 -       .quad 0x0000000000000000        /* NULL descriptor */
689 -       .quad 0x0000000000000000        /* 0x0b reserved */
690 -       .quad 0x0000000000000000        /* 0x13 reserved */
691 -       .quad 0x0000000000000000        /* 0x1b reserved */
692 -       .quad 0x0000000000000000        /* 0x20 unused */
693 -       .quad 0x0000000000000000        /* 0x28 unused */
694 -       .quad 0x0000000000000000        /* 0x33 TLS entry 1 */
695 -       .quad 0x0000000000000000        /* 0x3b TLS entry 2 */
696 -       .quad 0x0000000000000000        /* 0x43 TLS entry 3 */
697 -       .quad 0x0000000000000000        /* 0x4b reserved */
698 -       .quad 0x0000000000000000        /* 0x53 reserved */
699 -       .quad 0x0000000000000000        /* 0x5b reserved */
700 -
701 -       .quad 0x00cf9a000000ffff        /* 0x60 kernel 4GB code at 0x00000000 */
702 -       .quad 0x00cf92000000ffff        /* 0x68 kernel 4GB data at 0x00000000 */
703 -       .quad 0x00cffa000000ffff        /* 0x73 user 4GB code at 0x00000000 */
704 -       .quad 0x00cff2000000ffff        /* 0x7b user 4GB data at 0x00000000 */
705 -
706 -       .quad 0x0000000000000000        /* 0x80 TSS descriptor */
707 -       .quad 0x0000000000000000        /* 0x88 LDT descriptor */
708 -
709 -       /*
710 -        * Segments used for calling PnP BIOS have byte granularity.
711 -        * They code segments and data segments have fixed 64k limits,
712 -        * the transfer segment sizes are set at run time.
713 -        */
714 -       .quad 0x0000000000000000        /* 0x90 32-bit code */
715 -       .quad 0x0000000000000000        /* 0x98 16-bit code */
716 -       .quad 0x0000000000000000        /* 0xa0 16-bit data */
717 -       .quad 0x0000000000000000        /* 0xa8 16-bit data */
718 -       .quad 0x0000000000000000        /* 0xb0 16-bit data */
719 -
720 -       /*
721 -        * The APM segments have byte granularity and their bases
722 -        * are set at run time.  All have 64k limits.
723 -        */
724 -       .quad 0x0000000000000000        /* 0xb8 APM CS    code */
725 -       .quad 0x0000000000000000        /* 0xc0 APM CS 16 code (16 bit) */
726 -       .quad 0x0000000000000000        /* 0xc8 APM DS    data */
727 -
728 -       .quad 0x0000000000000000        /* 0xd0 - ESPFIX SS */
729 -       .quad 0x00cf92000000ffff        /* 0xd8 - PDA */
730 -       .quad 0x0000000000000000        /* 0xe0 - unused */
731 -       .quad 0x0000000000000000        /* 0xe8 - unused */
732 -       .quad 0x0000000000000000        /* 0xf0 - unused */
733 -       .quad 0x0000000000000000        /* 0xf8 - GDT entry 31: double-fault TSS */
734 -       .align PAGE_SIZE_asm
735 -
736  #if CONFIG_XEN_COMPAT <= 0x030002
737  /*
738   * __xen_guest information
739 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/io_apic_32-xen.c   2009-11-06 10:46:41.000000000 +0100
740 +++ sle11sp1-2010-03-01/arch/x86/kernel/io_apic_32-xen.c        2009-11-06 10:49:47.000000000 +0100
741 @@ -25,7 +25,6 @@
742  #include <linux/init.h>
743  #include <linux/delay.h>
744  #include <linux/sched.h>
745 -#include <linux/smp_lock.h>
746  #include <linux/mc146818rtc.h>
747  #include <linux/compiler.h>
748  #include <linux/acpi.h>
749 @@ -35,6 +34,7 @@
750  #include <linux/msi.h>
751  #include <linux/htirq.h>
752  #include <linux/freezer.h>
753 +#include <linux/kthread.h>
754  
755  #include <asm/io.h>
756  #include <asm/smp.h>
757 @@ -710,8 +710,6 @@ static int balanced_irq(void *unused)
758         unsigned long prev_balance_time = jiffies;
759         long time_remaining = balanced_irq_interval;
760  
761 -       daemonize("kirqd");
762 -       
763         /* push everything to CPU 0 to give us a starting point.  */
764         for (i = 0 ; i < NR_IRQS ; i++) {
765                 irq_desc[i].pending_mask = cpumask_of_cpu(0);
766 @@ -771,10 +769,9 @@ static int __init balanced_irq_init(void
767         }
768         
769         printk(KERN_INFO "Starting balanced_irq\n");
770 -       if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) 
771 +       if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
772                 return 0;
773 -       else 
774 -               printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
775 +       printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
776  failed:
777         for_each_possible_cpu(i) {
778                 kfree(irq_cpu_data[i].irq_delta);
779 @@ -1455,10 +1452,6 @@ static void __init setup_ExtINT_IRQ0_pin
780         enable_8259A_irq(0);
781  }
782  
783 -static inline void UNEXPECTED_IO_APIC(void)
784 -{
785 -}
786 -
787  void __init print_IO_APIC(void)
788  {
789         int apic, i;
790 @@ -1498,34 +1491,12 @@ void __init print_IO_APIC(void)
791         printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
792         printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
793         printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
794 -       if (reg_00.bits.ID >= get_physical_broadcast())
795 -               UNEXPECTED_IO_APIC();
796 -       if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
797 -               UNEXPECTED_IO_APIC();
798  
799         printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
800         printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
801 -       if (    (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
802 -               (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
803 -               (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
804 -               (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
805 -               (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
806 -               (reg_01.bits.entries != 0x2E) &&
807 -               (reg_01.bits.entries != 0x3F)
808 -       )
809 -               UNEXPECTED_IO_APIC();
810  
811         printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
812         printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
813 -       if (    (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
814 -               (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
815 -               (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
816 -               (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
817 -               (reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
818 -       )
819 -               UNEXPECTED_IO_APIC();
820 -       if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
821 -               UNEXPECTED_IO_APIC();
822  
823         /*
824          * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
825 @@ -1535,8 +1506,6 @@ void __init print_IO_APIC(void)
826         if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
827                 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
828                 printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
829 -               if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
830 -                       UNEXPECTED_IO_APIC();
831         }
832  
833         /*
834 @@ -1548,8 +1517,6 @@ void __init print_IO_APIC(void)
835             reg_03.raw != reg_01.raw) {
836                 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
837                 printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
838 -               if (reg_03.bits.__reserved_1)
839 -                       UNEXPECTED_IO_APIC();
840         }
841  
842         printk(KERN_DEBUG ".... IRQ redirection table:\n");
843 @@ -2686,19 +2653,19 @@ int arch_setup_msi_irq(struct pci_dev *d
844         if (irq < 0)
845                 return irq;
846  
847 -       set_irq_msi(irq, desc);
848         ret = msi_compose_msg(dev, irq, &msg);
849         if (ret < 0) {
850                 destroy_irq(irq);
851                 return ret;
852         }
853  
854 +       set_irq_msi(irq, desc);
855         write_msi_msg(irq, &msg);
856  
857         set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
858                                       "edge");
859  
860 -       return irq;
861 +       return 0;
862  }
863  
864  void arch_teardown_msi_irq(unsigned int irq)
865 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/ioport_32-xen.c    2008-01-28 12:24:19.000000000 +0100
866 +++ sle11sp1-2010-03-01/arch/x86/kernel/ioport_32-xen.c 2009-11-06 10:49:47.000000000 +0100
867 @@ -12,10 +12,10 @@
868  #include <linux/types.h>
869  #include <linux/ioport.h>
870  #include <linux/smp.h>
871 -#include <linux/smp_lock.h>
872  #include <linux/stddef.h>
873  #include <linux/slab.h>
874  #include <linux/thread_info.h>
875 +#include <linux/syscalls.h>
876  #include <xen/interface/physdev.h>
877  
878  /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
879 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/irq_32-xen.c       2009-11-06 10:46:41.000000000 +0100
880 +++ sle11sp1-2010-03-01/arch/x86/kernel/irq_32-xen.c    2009-11-06 10:49:47.000000000 +0100
881 @@ -24,6 +24,9 @@
882  DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
883  EXPORT_PER_CPU_SYMBOL(irq_stat);
884  
885 +DEFINE_PER_CPU(struct pt_regs *, irq_regs);
886 +EXPORT_PER_CPU_SYMBOL(irq_regs);
887 +
888  /*
889   * 'what should we do if we get a hw irq event on an illegal vector'.
890   * each architecture has to answer this themselves.
891 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/ldt_32-xen.c       2009-11-06 10:46:27.000000000 +0100
892 +++ sle11sp1-2010-03-01/arch/x86/kernel/ldt_32-xen.c    2009-11-06 10:49:47.000000000 +0100
893 @@ -10,7 +10,6 @@
894  #include <linux/string.h>
895  #include <linux/mm.h>
896  #include <linux/smp.h>
897 -#include <linux/smp_lock.h>
898  #include <linux/vmalloc.h>
899  #include <linux/slab.h>
900  
901 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/microcode-xen.c    2009-11-06 10:46:41.000000000 +0100
902 +++ sle11sp1-2010-03-01/arch/x86/kernel/microcode-xen.c 2009-11-06 10:49:47.000000000 +0100
903 @@ -135,7 +135,7 @@ static int __init microcode_dev_init (vo
904         return 0;
905  }
906  
907 -static void __exit microcode_dev_exit (void)
908 +static void microcode_dev_exit (void)
909  {
910         misc_deregister(&microcode_dev);
911  }
912 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/mpparse_32-xen.c   2009-11-06 10:46:41.000000000 +0100
913 +++ sle11sp1-2010-03-01/arch/x86/kernel/mpparse_32-xen.c        2009-11-06 10:49:47.000000000 +0100
914 @@ -18,7 +18,6 @@
915  #include <linux/acpi.h>
916  #include <linux/delay.h>
917  #include <linux/bootmem.h>
918 -#include <linux/smp_lock.h>
919  #include <linux/kernel_stat.h>
920  #include <linux/mc146818rtc.h>
921  #include <linux/bitops.h>
922 @@ -484,7 +483,7 @@ static int __init smp_read_mpc(struct mp
923                 }
924                 ++mpc_record;
925         }
926 -       clustered_apic_check();
927 +       setup_apic_routing();
928         if (!num_processors)
929                 printk(KERN_ERR "SMP mptable: no processors registered!\n");
930         return num_processors;
931 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/pci-dma-xen.c      2009-11-06 10:46:41.000000000 +0100
932 +++ sle11sp1-2010-03-01/arch/x86/kernel/pci-dma-xen.c   2009-11-06 10:49:47.000000000 +0100
933 @@ -13,6 +13,7 @@
934  #include <linux/pci.h>
935  #include <linux/module.h>
936  #include <linux/version.h>
937 +#include <linux/pci.h>
938  #include <asm/io.h>
939  #include <xen/balloon.h>
940  #include <xen/gnttab.h>
941 @@ -275,7 +276,7 @@ int dma_declare_coherent_memory(struct d
942  {
943         void __iomem *mem_base = NULL;
944         int pages = size >> PAGE_SHIFT;
945 -       int bitmap_size = (pages + 31)/32;
946 +       int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
947  
948         if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
949                 goto out;
950 @@ -348,6 +349,32 @@ void *dma_mark_declared_memory_occupied(
951  EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
952  #endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
953  
954 +#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
955 +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
956 +
957 +int forbid_dac;
958 +EXPORT_SYMBOL(forbid_dac);
959 +
960 +static __devinit void via_no_dac(struct pci_dev *dev)
961 +{
962 +       if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
963 +               printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
964 +               forbid_dac = 1;
965 +       }
966 +}
967 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
968 +
969 +static int check_iommu(char *s)
970 +{
971 +       if (!strcmp(s, "usedac")) {
972 +               forbid_dac = -1;
973 +               return 1;
974 +       }
975 +       return 0;
976 +}
977 +__setup("iommu=", check_iommu);
978 +#endif
979 +
980  dma_addr_t
981  dma_map_single(struct device *dev, void *ptr, size_t size,
982                enum dma_data_direction direction)
983 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/process_32-xen.c   2009-11-06 10:46:41.000000000 +0100
984 +++ sle11sp1-2010-03-01/arch/x86/kernel/process_32-xen.c        2009-11-06 10:49:47.000000000 +0100
985 @@ -21,7 +21,6 @@
986  #include <linux/mm.h>
987  #include <linux/elfcore.h>
988  #include <linux/smp.h>
989 -#include <linux/smp_lock.h>
990  #include <linux/stddef.h>
991  #include <linux/slab.h>
992  #include <linux/vmalloc.h>
993 @@ -39,6 +38,7 @@
994  #include <linux/random.h>
995  #include <linux/personality.h>
996  #include <linux/tick.h>
997 +#include <linux/percpu.h>
998  
999  #include <asm/uaccess.h>
1000  #include <asm/pgtable.h>
1001 @@ -61,7 +61,6 @@
1002  
1003  #include <asm/tlbflush.h>
1004  #include <asm/cpu.h>
1005 -#include <asm/pda.h>
1006  
1007  asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
1008  
1009 @@ -70,6 +69,12 @@ static int hlt_counter;
1010  unsigned long boot_option_idle_override = 0;
1011  EXPORT_SYMBOL(boot_option_idle_override);
1012  
1013 +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
1014 +EXPORT_PER_CPU_SYMBOL(current_task);
1015 +
1016 +DEFINE_PER_CPU(int, cpu_number);
1017 +EXPORT_PER_CPU_SYMBOL(cpu_number);
1018 +
1019  /*
1020   * Return saved PC of a blocked thread.
1021   */
1022 @@ -168,6 +173,7 @@ void cpu_idle(void)
1023                         if (__get_cpu_var(cpu_idle_state))
1024                                 __get_cpu_var(cpu_idle_state) = 0;
1025  
1026 +                       check_pgt_cache();
1027                         rmb();
1028                         idle = xen_idle; /* no alternatives */
1029  
1030 @@ -218,18 +224,19 @@ void __devinit select_idle_routine(const
1031  {
1032  }
1033  
1034 -static int __init idle_setup (char *str)
1035 +static int __init idle_setup(char *str)
1036  {
1037 -       if (!strncmp(str, "poll", 4)) {
1038 +       if (!strcmp(str, "poll")) {
1039                 printk("using polling idle threads.\n");
1040                 pm_idle = poll_idle;
1041         }
1042 +       else
1043 +               return -1;
1044  
1045         boot_option_idle_override = 1;
1046 -       return 1;
1047 +       return 0;
1048  }
1049 -
1050 -__setup("idle=", idle_setup);
1051 +early_param("idle", idle_setup);
1052  
1053  void show_regs(struct pt_regs * regs)
1054  {
1055 @@ -282,7 +289,7 @@ int kernel_thread(int (*fn)(void *), voi
1056  
1057         regs.xds = __USER_DS;
1058         regs.xes = __USER_DS;
1059 -       regs.xfs = __KERNEL_PDA;
1060 +       regs.xfs = __KERNEL_PERCPU;
1061         regs.orig_eax = -1;
1062         regs.eip = (unsigned long) kernel_thread_helper;
1063         regs.xcs = __KERNEL_CS | get_kernel_rpl();
1064 @@ -562,7 +569,7 @@ struct task_struct fastcall * __switch_t
1065          * multicall to indicate FPU task switch, rather than
1066          * synchronously trapping to Xen.
1067          */
1068 -       if (prev_p->thread_info->status & TS_USEDFPU) {
1069 +       if (task_thread_info(prev_p)->status & TS_USEDFPU) {
1070                 __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
1071                 mcl->op      = __HYPERVISOR_fpu_taskswitch;
1072                 mcl->args[0] = 1;
1073 @@ -669,7 +676,7 @@ struct task_struct fastcall * __switch_t
1074         if (prev->gs | next->gs)
1075                 loadsegment(gs, next->gs);
1076  
1077 -       write_pda(pcurrent, next_p);
1078 +       x86_write_percpu(current_task, next_p);
1079  
1080         return prev_p;
1081  }
1082 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/quirks-xen.c       2009-11-06 10:46:27.000000000 +0100
1083 +++ sle11sp1-2010-03-01/arch/x86/kernel/quirks-xen.c    2009-11-06 10:49:47.000000000 +0100
1084 @@ -3,12 +3,10 @@
1085   */
1086  #include <linux/pci.h>
1087  #include <linux/irq.h>
1088 -#include <asm/pci-direct.h>
1089 -#include <asm/genapic.h>
1090 -#include <asm/cpu.h>
1091  
1092  #if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
1093 -static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
1094 +
1095 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
1096  {
1097         u8 config, rev;
1098         u32 word;
1099 @@ -16,7 +14,7 @@ static void __devinit verify_quirk_intel
1100         /* BIOS may enable hardware IRQ balancing for
1101          * E7520/E7320/E7525(revision ID 0x9 and below)
1102          * based platforms.
1103 -        * For those platforms, make sure that the genapic is set to 'flat'
1104 +        * Disable SW irqbalance/affinity on those platforms.
1105          */
1106         pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
1107         if (rev > 0x9)
1108 @@ -30,59 +28,20 @@ static void __devinit verify_quirk_intel
1109         raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
1110  
1111         if (!(word & (1 << 13))) {
1112 -#ifndef CONFIG_XEN
1113 -#ifdef CONFIG_X86_64
1114 -               if (genapic !=  &apic_flat)
1115 -                       panic("APIC mode must be flat on this system\n");
1116 -#elif defined(CONFIG_X86_GENERICARCH)
1117 -               if (genapic != &apic_default)
1118 -                       panic("APIC mode must be default(flat) on this system. Use apic=default\n");
1119 -#endif
1120 -#endif
1121 -       }
1122 -
1123 -       /* put back the original value for config space*/
1124 -       if (!(config & 0x2))
1125 -               pci_write_config_byte(dev, 0xf4, config);
1126 -}
1127 -
1128 -void __init quirk_intel_irqbalance(void)
1129 -{
1130 -       u8 config, rev;
1131 -       u32 word;
1132 -
1133 -       /* BIOS may enable hardware IRQ balancing for
1134 -        * E7520/E7320/E7525(revision ID 0x9 and below)
1135 -        * based platforms.
1136 -        * Disable SW irqbalance/affinity on those platforms.
1137 -        */
1138 -       rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
1139 -       if (rev > 0x9)
1140 -               return;
1141 -
1142 -       printk(KERN_INFO "Intel E7520/7320/7525 detected.");
1143 -
1144 -       /* enable access to config space */
1145 -       config = read_pci_config_byte(0, 0, 0, 0xf4);
1146 -       write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
1147 -
1148 -       /* read xTPR register */
1149 -       word = read_pci_config_16(0, 0, 0x40, 0x4c);
1150 -
1151 -       if (!(word & (1 << 13))) {
1152                 struct xen_platform_op op;
1153 -               printk(KERN_INFO "Disabling irq balancing and affinity\n");
1154 +
1155 +               printk(KERN_INFO "Intel E7520/7320/7525 detected. "
1156 +                       "Disabling irq balancing and affinity\n");
1157                 op.cmd = XENPF_platform_quirk;
1158                 op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
1159                 WARN_ON(HYPERVISOR_platform_op(&op));
1160         }
1161  
1162 -       /* put back the original value for config space */
1163 +       /* put back the original value for config space*/
1164         if (!(config & 0x2))
1165 -               write_pci_config_byte(0, 0, 0, 0xf4, config);
1166 +               pci_write_config_byte(dev, 0xf4, config);
1167  }
1168 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  verify_quirk_intel_irqbalance);
1169 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  verify_quirk_intel_irqbalance);
1170 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  verify_quirk_intel_irqbalance);
1171 -
1172 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  quirk_intel_irqbalance);
1173 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  quirk_intel_irqbalance);
1174 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  quirk_intel_irqbalance);
1175  #endif
1176 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/smp_32-xen.c       2009-11-06 10:46:41.000000000 +0100
1177 +++ sle11sp1-2010-03-01/arch/x86/kernel/smp_32-xen.c    2009-11-06 10:49:47.000000000 +0100
1178 @@ -13,7 +13,6 @@
1179  #include <linux/mm.h>
1180  #include <linux/delay.h>
1181  #include <linux/spinlock.h>
1182 -#include <linux/smp_lock.h>
1183  #include <linux/kernel_stat.h>
1184  #include <linux/mc146818rtc.h>
1185  #include <linux/cache.h>
1186 @@ -216,7 +215,6 @@ static cpumask_t flush_cpumask;
1187  static struct mm_struct * flush_mm;
1188  static unsigned long flush_va;
1189  static DEFINE_SPINLOCK(tlbstate_lock);
1190 -#define FLUSH_ALL      0xffffffff
1191  
1192  /*
1193   * We cannot call mmdrop() because we are in interrupt context, 
1194 @@ -298,7 +296,7 @@ irqreturn_t smp_invalidate_interrupt(int
1195                  
1196         if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
1197                 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
1198 -                       if (flush_va == FLUSH_ALL)
1199 +                       if (flush_va == TLB_FLUSH_ALL)
1200                                 local_flush_tlb();
1201                         else
1202                                 __flush_tlb_one(flush_va);
1203 @@ -314,9 +312,11 @@ out:
1204         return IRQ_HANDLED;
1205  }
1206  
1207 -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
1208 -                                               unsigned long va)
1209 +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
1210 +                            unsigned long va)
1211  {
1212 +       cpumask_t cpumask = *cpumaskp;
1213 +
1214         /*
1215          * A couple of (to be removed) sanity checks:
1216          *
1217 @@ -327,10 +327,12 @@ static void flush_tlb_others(cpumask_t c
1218         BUG_ON(cpu_isset(smp_processor_id(), cpumask));
1219         BUG_ON(!mm);
1220  
1221 +#ifdef CONFIG_HOTPLUG_CPU
1222         /* If a CPU which we ran on has gone down, OK. */
1223         cpus_and(cpumask, cpumask, cpu_online_map);
1224 -       if (cpus_empty(cpumask))
1225 +       if (unlikely(cpus_empty(cpumask)))
1226                 return;
1227 +#endif
1228  
1229         /*
1230          * i'm not happy about this global shared spinlock in the
1231 @@ -341,17 +343,7 @@ static void flush_tlb_others(cpumask_t c
1232         
1233         flush_mm = mm;
1234         flush_va = va;
1235 -#if NR_CPUS <= BITS_PER_LONG
1236 -       atomic_set_mask(cpumask, &flush_cpumask);
1237 -#else
1238 -       {
1239 -               int k;
1240 -               unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
1241 -               unsigned long *cpu_mask = (unsigned long *)&cpumask;
1242 -               for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
1243 -                       atomic_set_mask(cpu_mask[k], &flush_mask[k]);
1244 -       }
1245 -#endif
1246 +       cpus_or(flush_cpumask, cpumask, flush_cpumask);
1247         /*
1248          * We have to send the IPI only to
1249          * CPUs affected.
1250 @@ -378,7 +370,7 @@ void flush_tlb_current_task(void)
1251  
1252         local_flush_tlb();
1253         if (!cpus_empty(cpu_mask))
1254 -               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
1255 +               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
1256         preempt_enable();
1257  }
1258  
1259 @@ -397,7 +389,7 @@ void flush_tlb_mm (struct mm_struct * mm
1260                         leave_mm(smp_processor_id());
1261         }
1262         if (!cpus_empty(cpu_mask))
1263 -               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
1264 +               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
1265  
1266         preempt_enable();
1267  }
1268 @@ -446,7 +438,7 @@ void flush_tlb_all(void)
1269   * it goes straight through and wastes no time serializing
1270   * anything. Worst case is that we lose a reschedule ...
1271   */
1272 -void smp_send_reschedule(int cpu)
1273 +void xen_smp_send_reschedule(int cpu)
1274  {
1275         WARN_ON(cpu_is_offline(cpu));
1276         send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
1277 @@ -478,36 +470,79 @@ void unlock_ipi_call_lock(void)
1278  
1279  static struct call_data_struct *call_data;
1280  
1281 +static void __smp_call_function(void (*func) (void *info), void *info,
1282 +                               int nonatomic, int wait)
1283 +{
1284 +       struct call_data_struct data;
1285 +       int cpus = num_online_cpus() - 1;
1286 +
1287 +       if (!cpus)
1288 +               return;
1289 +
1290 +       data.func = func;
1291 +       data.info = info;
1292 +       atomic_set(&data.started, 0);
1293 +       data.wait = wait;
1294 +       if (wait)
1295 +               atomic_set(&data.finished, 0);
1296 +
1297 +       call_data = &data;
1298 +       mb();
1299 +
1300 +       /* Send a message to all other CPUs and wait for them to respond */
1301 +       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
1302 +
1303 +       /* Wait for response */
1304 +       while (atomic_read(&data.started) != cpus)
1305 +               cpu_relax();
1306 +
1307 +       if (wait)
1308 +               while (atomic_read(&data.finished) != cpus)
1309 +                       cpu_relax();
1310 +}
1311 +
1312 +
1313  /**
1314 - * smp_call_function(): Run a function on all other CPUs.
1315 + * smp_call_function_mask(): Run a function on a set of other CPUs.
1316 + * @mask: The set of cpus to run on.  Must not include the current cpu.
1317   * @func: The function to run. This must be fast and non-blocking.
1318   * @info: An arbitrary pointer to pass to the function.
1319 - * @nonatomic: currently unused.
1320   * @wait: If true, wait (atomically) until function has completed on other CPUs.
1321   *
1322 - * Returns 0 on success, else a negative status code. Does not return until
1323 - * remote CPUs are nearly ready to execute <<func>> or are or have executed.
1324 +  * Returns 0 on success, else a negative status code.
1325 + *
1326 + * If @wait is true, then returns once @func has returned; otherwise
1327 + * it returns just before the target cpu calls @func.
1328   *
1329   * You must not call this function with disabled interrupts or from a
1330   * hardware interrupt handler or from a bottom half handler.
1331   */
1332 -int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
1333 -                       int wait)
1334 +int
1335 +xen_smp_call_function_mask(cpumask_t mask,
1336 +                             void (*func)(void *), void *info,
1337 +                             int wait)
1338  {
1339         struct call_data_struct data;
1340 +       cpumask_t allbutself;
1341         int cpus;
1342  
1343 +       /* Can deadlock when called with interrupts disabled */
1344 +       WARN_ON(irqs_disabled());
1345 +
1346         /* Holding any lock stops cpus from going down. */
1347         spin_lock(&call_lock);
1348 -       cpus = num_online_cpus() - 1;
1349 +
1350 +       allbutself = cpu_online_map;
1351 +       cpu_clear(smp_processor_id(), allbutself);
1352 +
1353 +       cpus_and(mask, mask, allbutself);
1354 +       cpus = cpus_weight(mask);
1355 +
1356         if (!cpus) {
1357                 spin_unlock(&call_lock);
1358                 return 0;
1359         }
1360  
1361 -       /* Can deadlock when called with interrupts disabled */
1362 -       WARN_ON(irqs_disabled());
1363 -
1364         data.func = func;
1365         data.info = info;
1366         atomic_set(&data.started, 0);
1367 @@ -517,9 +552,12 @@ int smp_call_function (void (*func) (voi
1368  
1369         call_data = &data;
1370         mb();
1371 -       
1372 -       /* Send a message to all other CPUs and wait for them to respond */
1373 -       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
1374 +
1375 +       /* Send a message to other CPUs */
1376 +       if (cpus_equal(mask, allbutself))
1377 +               send_IPI_allbutself(CALL_FUNCTION_VECTOR);
1378 +       else
1379 +               send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
1380  
1381         /* Wait for response */
1382         while (atomic_read(&data.started) != cpus)
1383 @@ -532,15 +570,14 @@ int smp_call_function (void (*func) (voi
1384  
1385         return 0;
1386  }
1387 -EXPORT_SYMBOL(smp_call_function);
1388  
1389  static void stop_this_cpu (void * dummy)
1390  {
1391 +       local_irq_disable();
1392         /*
1393          * Remove this CPU:
1394          */
1395         cpu_clear(smp_processor_id(), cpu_online_map);
1396 -       local_irq_disable();
1397         disable_all_local_evtchn();
1398         if (cpu_data[smp_processor_id()].hlt_works_ok)
1399                 for(;;) halt();
1400 @@ -551,13 +588,18 @@ static void stop_this_cpu (void * dummy)
1401   * this function calls the 'stop' function on all other CPUs in the system.
1402   */
1403  
1404 -void smp_send_stop(void)
1405 +void xen_smp_send_stop(void)
1406  {
1407 -       smp_call_function(stop_this_cpu, NULL, 1, 0);
1408 +       /* Don't deadlock on the call lock in panic */
1409 +       int nolock = !spin_trylock(&call_lock);
1410 +       unsigned long flags;
1411  
1412 -       local_irq_disable();
1413 +       local_irq_save(flags);
1414 +       __smp_call_function(stop_this_cpu, NULL, 0, 0);
1415 +       if (!nolock)
1416 +               spin_unlock(&call_lock);
1417         disable_all_local_evtchn();
1418 -       local_irq_enable();
1419 +       local_irq_restore(flags);
1420  }
1421  
1422  /*
1423 @@ -598,74 +640,3 @@ irqreturn_t smp_call_function_interrupt(
1424  
1425         return IRQ_HANDLED;
1426  }
1427 -
1428 -/*
1429 - * this function sends a 'generic call function' IPI to one other CPU
1430 - * in the system.
1431 - *
1432 - * cpu is a standard Linux logical CPU number.
1433 - */
1434 -static void
1435 -__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
1436 -                               int nonatomic, int wait)
1437 -{
1438 -       struct call_data_struct data;
1439 -       int cpus = 1;
1440 -
1441 -       data.func = func;
1442 -       data.info = info;
1443 -       atomic_set(&data.started, 0);
1444 -       data.wait = wait;
1445 -       if (wait)
1446 -               atomic_set(&data.finished, 0);
1447 -
1448 -       call_data = &data;
1449 -       wmb();
1450 -       /* Send a message to all other CPUs and wait for them to respond */
1451 -       send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
1452 -
1453 -       /* Wait for response */
1454 -       while (atomic_read(&data.started) != cpus)
1455 -               cpu_relax();
1456 -
1457 -       if (!wait)
1458 -               return;
1459 -
1460 -       while (atomic_read(&data.finished) != cpus)
1461 -               cpu_relax();
1462 -}
1463 -
1464 -/*
1465 - * smp_call_function_single - Run a function on another CPU
1466 - * @func: The function to run. This must be fast and non-blocking.
1467 - * @info: An arbitrary pointer to pass to the function.
1468 - * @nonatomic: Currently unused.
1469 - * @wait: If true, wait until function has completed on other CPUs.
1470 - *
1471 - * Retrurns 0 on success, else a negative status code.
1472 - *
1473 - * Does not return until the remote CPU is nearly ready to execute <func>
1474 - * or is or has executed.
1475 - */
1476 -
1477 -int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
1478 -                       int nonatomic, int wait)
1479 -{
1480 -       /* prevent preemption and reschedule on another processor */
1481 -       int me = get_cpu();
1482 -       if (cpu == me) {
1483 -               WARN_ON(1);
1484 -               put_cpu();
1485 -               return -EBUSY;
1486 -       }
1487 -
1488 -       /* Can deadlock when called with interrupts disabled */
1489 -       WARN_ON(irqs_disabled());
1490 -
1491 -       spin_lock_bh(&call_lock);
1492 -       __smp_call_function_single(cpu, func, info, nonatomic, wait);
1493 -       spin_unlock_bh(&call_lock);
1494 -       put_cpu();
1495 -       return 0;
1496 -}
1497 -EXPORT_SYMBOL(smp_call_function_single);
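
The rewritten xen_smp_call_function_mask() above keeps the original call_data handshake: publish func/info through call_data, issue the IPI, then spin until data.started (and, if @wait, data.finished) reaches the number of targeted CPUs. Below is a minimal userspace model of that handshake, using a pthread in place of the IPI and C11 atomics in place of the kernel's atomic_t; every name ending in _demo is invented for the illustration.

/* Userspace model of the call_data handshake used above. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct call_data_demo {
	void (*func)(void *info);
	void *info;
	atomic_int started;
	atomic_int finished;
	int wait;
};

static struct call_data_demo *call_data_demo_ptr;	/* plays "call_data" */

static void say_hello_demo(void *info)
{
	printf("remote cpu ran: %s\n", (const char *)info);
}

static void *remote_cpu_demo(void *arg)
{
	struct call_data_demo *data = call_data_demo_ptr;

	(void)arg;
	atomic_fetch_add(&data->started, 1);	/* "I have seen the request" */
	data->func(data->info);
	if (data->wait)
		atomic_fetch_add(&data->finished, 1);
	return NULL;
}

int main(void)
{
	struct call_data_demo data = { .func = say_hello_demo,
				       .info = "hello", .wait = 1 };
	pthread_t t;
	int cpus = 1;			/* one "other CPU" in the mask */

	call_data_demo_ptr = &data;	/* the mb() + call_data publish */
	pthread_create(&t, NULL, remote_cpu_demo, NULL);	/* the "IPI" */

	while (atomic_load(&data.started) != cpus)	/* wait for response */
		;
	if (data.wait)
		while (atomic_load(&data.finished) != cpus)
			;
	pthread_join(t, NULL);
	return 0;
}
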
1498 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/time-xen.c 2010-03-01 14:29:27.000000000 +0100
1499 +++ sle11sp1-2010-03-01/arch/x86/kernel/time-xen.c      2010-03-01 14:30:29.000000000 +0100
1500 @@ -79,7 +79,6 @@
1501  #include <asm/i8253.h>
1502  DEFINE_SPINLOCK(i8253_lock);
1503  EXPORT_SYMBOL(i8253_lock);
1504 -int pit_latch_buggy;              /* extern */
1505  #else
1506  volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
1507  #endif
1508 @@ -199,6 +198,36 @@ static inline u64 scale_delta(u64 delta,
1509         return product;
1510  }
1511  
1512 +static inline u64 get64(volatile u64 *ptr)
1513 +{
1514 +#ifndef CONFIG_64BIT
1515 +       u64 res;
1516 +       __asm__("movl %%ebx,%%eax\n"
1517 +               "movl %%ecx,%%edx\n"
1518 +               LOCK_PREFIX "cmpxchg8b %1"
1519 +               : "=&A" (res) : "m" (*ptr));
1520 +       return res;
1521 +#else
1522 +       return *ptr;
1523 +#define cmpxchg64 cmpxchg
1524 +#endif
1525 +}
1526 +
1527 +static inline u64 get64_local(volatile u64 *ptr)
1528 +{
1529 +#ifndef CONFIG_64BIT
1530 +       u64 res;
1531 +       __asm__("movl %%ebx,%%eax\n"
1532 +               "movl %%ecx,%%edx\n"
1533 +               "cmpxchg8b %1"
1534 +               : "=&A" (res) : "m" (*ptr));
1535 +       return res;
1536 +#else
1537 +       return *ptr;
1538 +#define cmpxchg64_local cmpxchg_local
1539 +#endif
1540 +}
1541 +
1542  static void init_cpu_khz(void)
1543  {
1544         u64 __cpu_khz = 1000000ULL << 32;
1545 @@ -378,7 +407,7 @@ static int set_rtc_mmss(unsigned long no
1546         return retval;
1547  }
1548  
1549 -unsigned long long sched_clock(void)
1550 +static unsigned long long local_clock(void)
1551  {
1552         unsigned int cpu = get_cpu();
1553         struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
1554 @@ -399,6 +428,61 @@ unsigned long long sched_clock(void)
1555         return time;
1556  }
1557  
1558 +/*
1559 + * Runstate accounting
1560 + */
1561 +static void get_runstate_snapshot(struct vcpu_runstate_info *res)
1562 +{
1563 +       u64 state_time;
1564 +       struct vcpu_runstate_info *state;
1565 +
1566 +       BUG_ON(preemptible());
1567 +
1568 +       state = &__get_cpu_var(runstate);
1569 +
1570 +       do {
1571 +               state_time = get64_local(&state->state_entry_time);
1572 +               *res = *state;
1573 +       } while (get64_local(&state->state_entry_time) != state_time);
1574 +
1575 +       WARN_ON_ONCE(res->state != RUNSTATE_running);
1576 +}
1577 +
1578 +/*
1579 + * Xen sched_clock implementation.  Returns the number of unstolen
1580 + * nanoseconds, i.e. the nanoseconds the VCPU spent in the RUNNING and
1581 + * BLOCKED states.
1582 + */
1583 +unsigned long long sched_clock(void)
1584 +{
1585 +       struct vcpu_runstate_info runstate;
1586 +       cycle_t now;
1587 +       u64 ret;
1588 +       s64 offset;
1589 +
1590 +       /*
1591 +        * Ideally sched_clock should be called on a per-cpu basis
1592 +        * anyway, so preempt should already be disabled, but that's
1593 +        * not current practice at the moment.
1594 +        */
1595 +       preempt_disable();
1596 +
1597 +       now = local_clock();
1598 +
1599 +       get_runstate_snapshot(&runstate);
1600 +
1601 +       offset = now - runstate.state_entry_time;
1602 +       if (offset < 0)
1603 +               offset = 0;
1604 +
1605 +       ret = offset + runstate.time[RUNSTATE_running]
1606 +             + runstate.time[RUNSTATE_blocked];
1607 +
1608 +       preempt_enable();
1609 +
1610 +       return ret;
1611 +}
1612 +
1613  unsigned long profile_pc(struct pt_regs *regs)
1614  {
1615         unsigned long pc = instruction_pointer(regs);
1616 @@ -446,10 +530,9 @@ EXPORT_SYMBOL(profile_pc);
1617  irqreturn_t timer_interrupt(int irq, void *dev_id)
1618  {
1619         s64 delta, delta_cpu, stolen, blocked;
1620 -       u64 sched_time;
1621         unsigned int i, cpu = smp_processor_id();
1622         struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
1623 -       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
1624 +       struct vcpu_runstate_info runstate;
1625  
1626         /*
1627          * Here we are in the timer irq handler. We just have irqs locally
1628 @@ -469,20 +552,7 @@ irqreturn_t timer_interrupt(int irq, voi
1629                 delta     -= processed_system_time;
1630                 delta_cpu -= per_cpu(processed_system_time, cpu);
1631  
1632 -               /*
1633 -                * Obtain a consistent snapshot of stolen/blocked cycles. We
1634 -                * can use state_entry_time to detect if we get preempted here.
1635 -                */
1636 -               do {
1637 -                       sched_time = runstate->state_entry_time;
1638 -                       barrier();
1639 -                       stolen = runstate->time[RUNSTATE_runnable] +
1640 -                               runstate->time[RUNSTATE_offline] -
1641 -                               per_cpu(processed_stolen_time, cpu);
1642 -                       blocked = runstate->time[RUNSTATE_blocked] -
1643 -                               per_cpu(processed_blocked_time, cpu);
1644 -                       barrier();
1645 -               } while (sched_time != runstate->state_entry_time);
1646 +               get_runstate_snapshot(&runstate);
1647         } while (!time_values_up_to_date(cpu));
1648  
1649         if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
1650 @@ -525,6 +595,9 @@ irqreturn_t timer_interrupt(int irq, voi
1651          * HACK: Passing NULL to account_steal_time()
1652          * ensures that the ticks are accounted as stolen.
1653          */
1654 +       stolen = runstate.time[RUNSTATE_runnable]
1655 +                + runstate.time[RUNSTATE_offline]
1656 +                - per_cpu(processed_stolen_time, cpu);
1657         if ((stolen > 0) && (delta_cpu > 0)) {
1658                 delta_cpu -= stolen;
1659                 if (unlikely(delta_cpu < 0))
1660 @@ -540,6 +613,8 @@ irqreturn_t timer_interrupt(int irq, voi
1661          * HACK: Passing idle_task to account_steal_time()
1662          * ensures that the ticks are accounted as idle/wait.
1663          */
1664 +       blocked = runstate.time[RUNSTATE_blocked]
1665 +                 - per_cpu(processed_blocked_time, cpu);
1666         if ((blocked > 0) && (delta_cpu > 0)) {
1667                 delta_cpu -= blocked;
1668                 if (unlikely(delta_cpu < 0))
1669 @@ -576,7 +651,7 @@ irqreturn_t timer_interrupt(int irq, voi
1670         return IRQ_HANDLED;
1671  }
1672  
1673 -void mark_tsc_unstable(void)
1674 +void mark_tsc_unstable(char *reason)
1675  {
1676  #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
1677         tsc_unstable = 1;
1678 @@ -584,17 +659,13 @@ void mark_tsc_unstable(void)
1679  }
1680  EXPORT_SYMBOL_GPL(mark_tsc_unstable);
1681  
1682 +static cycle_t cs_last;
1683 +
1684  static cycle_t xen_clocksource_read(void)
1685  {
1686  #ifdef CONFIG_SMP
1687 -       static cycle_t last_ret;
1688 -#ifndef CONFIG_64BIT
1689 -       cycle_t last = cmpxchg64(&last_ret, 0, 0);
1690 -#else
1691 -       cycle_t last = last_ret;
1692 -#define cmpxchg64 cmpxchg
1693 -#endif
1694 -       cycle_t ret = sched_clock();
1695 +       cycle_t last = get64(&cs_last);
1696 +       cycle_t ret = local_clock();
1697  
1698         if (unlikely((s64)(ret - last) < 0)) {
1699                 if (last - ret > permitted_clock_jitter
1700 @@ -613,17 +684,25 @@ static cycle_t xen_clocksource_read(void
1701         }
1702  
1703         for (;;) {
1704 -               cycle_t cur = cmpxchg64(&last_ret, last, ret);
1705 +               cycle_t cur = cmpxchg64(&cs_last, last, ret);
1706  
1707                 if (cur == last || (s64)(ret - cur) < 0)
1708                         return ret;
1709                 last = cur;
1710         }
1711  #else
1712 -       return sched_clock();
1713 +       return local_clock();
1714  #endif
1715  }
1716  
1717 +static void xen_clocksource_resume(void)
1718 +{
1719 +       extern void time_resume(void);
1720 +
1721 +       time_resume();
1722 +       cs_last = local_clock();
1723 +}
1724 +
1725  static struct clocksource clocksource_xen = {
1726         .name                   = "xen",
1727         .rating                 = 400,
1728 @@ -632,19 +711,29 @@ static struct clocksource clocksource_xe
1729         .mult                   = 1 << XEN_SHIFT,               /* time directly in nanoseconds */
1730         .shift                  = XEN_SHIFT,
1731         .flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
1732 +       .resume                 = xen_clocksource_resume,
1733  };
1734  
1735 -static void init_missing_ticks_accounting(unsigned int cpu)
1736 +struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu)
1737  {
1738         struct vcpu_register_runstate_memory_area area;
1739         struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
1740         int rc;
1741  
1742 -       memset(runstate, 0, sizeof(*runstate));
1743 -
1744 -       area.addr.v = runstate;
1745 +       set_xen_guest_handle(area.addr.h, runstate);
1746         rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
1747 -       WARN_ON(rc && rc != -ENOSYS);
1748 +       if (rc) {
1749 +               BUILD_BUG_ON(RUNSTATE_running);
1750 +               memset(runstate, 0, sizeof(*runstate));
1751 +               WARN_ON(rc != -ENOSYS);
1752 +       }
1753 +
1754 +       return runstate;
1755 +}
1756 +
1757 +static void init_missing_ticks_accounting(unsigned int cpu)
1758 +{
1759 +       struct vcpu_runstate_info *runstate = setup_runstate_area(cpu);
1760  
1761         per_cpu(processed_blocked_time, cpu) =
1762                 runstate->time[RUNSTATE_blocked];
1763 @@ -720,35 +809,6 @@ void notify_arch_cmos_timer(void)
1764         mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
1765  }
1766  
1767 -static int timer_resume(struct sys_device *dev)
1768 -{
1769 -       extern void time_resume(void);
1770 -       time_resume();
1771 -       return 0;
1772 -}
1773 -
1774 -static struct sysdev_class timer_sysclass = {
1775 -       .resume = timer_resume,
1776 -       set_kset_name("timer"),
1777 -};
1778 -
1779 -
1780 -/* XXX this driverfs stuff should probably go elsewhere later -john */
1781 -static struct sys_device device_timer = {
1782 -       .id     = 0,
1783 -       .cls    = &timer_sysclass,
1784 -};
1785 -
1786 -static int time_init_device(void)
1787 -{
1788 -       int error = sysdev_class_register(&timer_sysclass);
1789 -       if (!error)
1790 -               error = sysdev_register(&device_timer);
1791 -       return error;
1792 -}
1793 -
1794 -device_initcall(time_init_device);
1795 -
1796  extern void (*late_time_init)(void);
1797  
1798  /* Dynamically-mapped IRQ. */
1799 @@ -892,21 +952,21 @@ static void start_hz_timer(void)
1800         cpu_clear(cpu, nohz_cpu_mask);
1801  }
1802  
1803 -void raw_safe_halt(void)
1804 +void xen_safe_halt(void)
1805  {
1806         stop_hz_timer();
1807         /* Blocking includes an implicit local_irq_enable(). */
1808         HYPERVISOR_block();
1809         start_hz_timer();
1810  }
1811 -EXPORT_SYMBOL(raw_safe_halt);
1812 +EXPORT_SYMBOL(xen_safe_halt);
1813  
1814 -void halt(void)
1815 +void xen_halt(void)
1816  {
1817         if (irqs_disabled())
1818                 VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
1819  }
1820 -EXPORT_SYMBOL(halt);
1821 +EXPORT_SYMBOL(xen_halt);
1822  
1823  /* No locking required. Interrupts are disabled on all CPUs. */
1824  void time_resume(void)
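
Two ideas in the time-xen.c changes above are worth spelling out: get_runstate_snapshot() re-reads state_entry_time before and after copying the runstate, seqlock-style, because the hypervisor bumps that field whenever it updates the area; and the new sched_clock() reports only "unstolen" time, i.e. the RUNNING and BLOCKED accumulators plus the time spent in the current state. A small userspace model of both follows; the structure layout and the numbers are illustrative only, not the real Xen ABI.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

enum { DEMO_running, DEMO_runnable, DEMO_blocked, DEMO_offline };

struct runstate_demo {
	_Atomic uint64_t state_entry_time;	/* bumped on every update */
	uint64_t time[4];			/* ns accumulated per state */
};

static struct runstate_demo live;

/* Retry until two reads of state_entry_time agree, seqlock-style. */
static void snapshot_demo(uint64_t out[4], uint64_t *entry_time)
{
	uint64_t seq;
	int i;

	do {
		seq = atomic_load(&live.state_entry_time);
		for (i = 0; i < 4; i++)
			out[i] = live.time[i];
	} while (atomic_load(&live.state_entry_time) != seq);
	*entry_time = seq;
}

int main(void)
{
	uint64_t snap[4], entered, now = 5000, unstolen;
	int64_t offset;

	atomic_store(&live.state_entry_time, 4800);	/* entered RUNNING at t=4800 */
	live.time[DEMO_running] = 3000;
	live.time[DEMO_blocked] = 1000;	/* runnable+offline would be "stolen" */

	snapshot_demo(snap, &entered);
	offset = (int64_t)(now - entered);	/* time in the current state */
	if (offset < 0)
		offset = 0;
	unstolen = (uint64_t)offset + snap[DEMO_running] + snap[DEMO_blocked];
	printf("sched_clock-style value: %llu ns\n",
	       (unsigned long long)unstolen);
	return 0;
}
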
1825 --- sle11sp1-2010-03-01.orig/arch/x86/kernel/traps_32-xen.c     2009-11-06 10:46:41.000000000 +0100
1826 +++ sle11sp1-2010-03-01/arch/x86/kernel/traps_32-xen.c  2009-11-06 10:49:47.000000000 +0100
1827 @@ -52,7 +52,7 @@
1828  #include <asm/unwind.h>
1829  #include <asm/smp.h>
1830  #include <asm/arch_hooks.h>
1831 -#include <asm/kdebug.h>
1832 +#include <linux/kdebug.h>
1833  #include <asm/stacktrace.h>
1834  
1835  #include <linux/module.h>
1836 @@ -101,20 +101,6 @@ asmlinkage void machine_check(void);
1837  
1838  int kstack_depth_to_print = 24;
1839  static unsigned int code_bytes = 64;
1840 -ATOMIC_NOTIFIER_HEAD(i386die_chain);
1841 -
1842 -int register_die_notifier(struct notifier_block *nb)
1843 -{
1844 -       vmalloc_sync_all();
1845 -       return atomic_notifier_chain_register(&i386die_chain, nb);
1846 -}
1847 -EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
1848 -
1849 -int unregister_die_notifier(struct notifier_block *nb)
1850 -{
1851 -       return atomic_notifier_chain_unregister(&i386die_chain, nb);
1852 -}
1853 -EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
1854  
1855  static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
1856  {
1857 @@ -325,7 +311,7 @@ void show_registers(struct pt_regs *regs
1858                regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
1859         printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
1860                 TASK_COMM_LEN, current->comm, current->pid,
1861 -               current_thread_info(), current, current->thread_info);
1862 +               current_thread_info(), current, task_thread_info(current));
1863         /*
1864          * When in-kernel, we also print out the stack and code at the
1865          * time of the fault..
1866 @@ -482,8 +468,6 @@ static void __kprobes do_trap(int trapnr
1867                               siginfo_t *info)
1868  {
1869         struct task_struct *tsk = current;
1870 -       tsk->thread.error_code = error_code;
1871 -       tsk->thread.trap_no = trapnr;
1872  
1873         if (regs->eflags & VM_MASK) {
1874                 if (vm86)
1875 @@ -495,6 +479,18 @@ static void __kprobes do_trap(int trapnr
1876                 goto kernel_trap;
1877  
1878         trap_signal: {
1879 +               /*
1880 +                * We want error_code and trap_no set for userspace faults and
1881 +                * kernelspace faults which result in die(), but not
1882 +                * kernelspace faults which are fixed up.  die() gives the
1883 +                * process no chance to handle the signal and notice the
1884 +                * kernel fault information, so that won't result in polluting
1885 +                * the information about previously queued, but not yet
1886 +                * delivered, faults.  See also do_general_protection below.
1887 +                */
1888 +               tsk->thread.error_code = error_code;
1889 +               tsk->thread.trap_no = trapnr;
1890 +
1891                 if (info)
1892                         force_sig_info(signr, info, tsk);
1893                 else
1894 @@ -503,8 +499,11 @@ static void __kprobes do_trap(int trapnr
1895         }
1896  
1897         kernel_trap: {
1898 -               if (!fixup_exception(regs))
1899 +               if (!fixup_exception(regs)) {
1900 +                       tsk->thread.error_code = error_code;
1901 +                       tsk->thread.trap_no = trapnr;
1902                         die(str, regs, error_code);
1903 +               }
1904                 return;
1905         }
1906  
1907 @@ -578,9 +577,6 @@ DO_ERROR_INFO(32, SIGSEGV, "iret excepti
1908  fastcall void __kprobes do_general_protection(struct pt_regs * regs,
1909                                               long error_code)
1910  {
1911 -       current->thread.error_code = error_code;
1912 -       current->thread.trap_no = 13;
1913 -
1914         if (regs->eflags & VM_MASK)
1915                 goto gp_in_vm86;
1916  
1917 @@ -599,6 +595,8 @@ gp_in_vm86:
1918  
1919  gp_in_kernel:
1920         if (!fixup_exception(regs)) {
1921 +               current->thread.error_code = error_code;
1922 +               current->thread.trap_no = 13;
1923                 if (notify_die(DIE_GPF, "general protection fault", regs,
1924                                 error_code, 13, SIGSEGV) == NOTIFY_STOP)
1925                         return;
1926 @@ -987,9 +985,7 @@ fastcall void do_spurious_interrupt_bug(
1927  fastcall unsigned long patch_espfix_desc(unsigned long uesp,
1928                                           unsigned long kesp)
1929  {
1930 -       int cpu = smp_processor_id();
1931 -       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1932 -       struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
1933 +       struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
1934         unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1935         unsigned long new_kesp = kesp - base;
1936         unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
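
The do_trap()/do_general_protection() hunks above move the thread.error_code/trap_no bookkeeping so it happens only on paths that actually report the fault (a signal to userspace, or die() for an unrecoverable kernel fault); a fixed-up kernel fault no longer overwrites information about a still-queued, not-yet-delivered fault. A compressed sketch of that control-flow shape, with stand-in types and helpers rather than the kernel's:

#include <stdio.h>

struct thread_demo {
	unsigned long error_code;
	unsigned long trap_no;
};

enum fault_origin_demo { DEMO_FROM_USER, DEMO_FROM_KERNEL };

static int fixup_exception_demo(void)
{
	return 0;	/* pretend there is no exception-table fixup */
}

static void do_trap_demo(struct thread_demo *t, enum fault_origin_demo origin,
			 int trapnr, unsigned long error_code)
{
	if (origin == DEMO_FROM_USER) {
		/* will be delivered as a signal: record the fault */
		t->error_code = error_code;
		t->trap_no = trapnr;
		/* force_sig_info(...) would go here */
		return;
	}

	if (!fixup_exception_demo()) {
		/* unrecoverable kernel fault, about to die(): record it */
		t->error_code = error_code;
		t->trap_no = trapnr;
		/* die(...) would go here */
		return;
	}

	/* fixed-up kernel fault: leave previously queued fault info untouched */
}

int main(void)
{
	struct thread_demo t = { 0, 0 };

	do_trap_demo(&t, DEMO_FROM_USER, 13, 0x6);
	printf("recorded trap %lu, error_code %#lx\n", t.trap_no, t.error_code);
	return 0;
}
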
1937 --- sle11sp1-2010-03-01.orig/arch/x86/mm/fault_32-xen.c 2009-11-06 10:46:41.000000000 +0100
1938 +++ sle11sp1-2010-03-01/arch/x86/mm/fault_32-xen.c      2009-11-06 10:49:47.000000000 +0100
1939 @@ -14,19 +14,20 @@
1940  #include <linux/mman.h>
1941  #include <linux/mm.h>
1942  #include <linux/smp.h>
1943 -#include <linux/smp_lock.h>
1944  #include <linux/interrupt.h>
1945  #include <linux/init.h>
1946  #include <linux/tty.h>
1947  #include <linux/vt_kern.h>             /* For unblank_screen() */
1948  #include <linux/highmem.h>
1949 +#include <linux/bootmem.h>             /* for max_low_pfn */
1950 +#include <linux/vmalloc.h>
1951  #include <linux/module.h>
1952  #include <linux/kprobes.h>
1953  #include <linux/uaccess.h>
1954 +#include <linux/kdebug.h>
1955  
1956  #include <asm/system.h>
1957  #include <asm/desc.h>
1958 -#include <asm/kdebug.h>
1959  #include <asm/segment.h>
1960  
1961  extern void die(const char *,struct pt_regs *,long);
1962 @@ -259,25 +260,20 @@ static void dump_fault_path(unsigned lon
1963         unsigned long page;
1964  
1965         page = read_cr3();
1966 -       page = ((unsigned long *) __va(page))[address >> 22];
1967 -       if (oops_may_print())
1968 -               printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
1969 -                      machine_to_phys(page));
1970 +       page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
1971 +       printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
1972 +              machine_to_phys(page));
1973         /*
1974          * We must not directly access the pte in the highpte
1975          * case if the page table is located in highmem.
1976          * And lets rather not kmap-atomic the pte, just in case
1977          * it's allocated already.
1978          */
1979 -#ifdef CONFIG_HIGHPTE
1980 -       if ((page >> PAGE_SHIFT) >= highstart_pfn)
1981 -               return;
1982 -#endif
1983 -       if ((page & 1) && oops_may_print()) {
1984 -               page &= PAGE_MASK;
1985 -               address &= 0x003ff000;
1986 -               page = machine_to_phys(page);
1987 -               page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
1988 +       if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
1989 +           && (page & _PAGE_PRESENT)) {
1990 +               page = machine_to_phys(page & PAGE_MASK);
1991 +               page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
1992 +                                                     & (PTRS_PER_PTE - 1)];
1993                 printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
1994                        machine_to_phys(page));
1995         }
1996 @@ -581,6 +577,11 @@ bad_area:
1997  bad_area_nosemaphore:
1998         /* User mode accesses just cause a SIGSEGV */
1999         if (error_code & 4) {
2000 +               /*
2001 +                * It's possible to have interrupts off here.
2002 +                */
2003 +               local_irq_enable();
2004 +
2005                 /* 
2006                  * Valid to do another page fault here because this one came 
2007                  * from user space.
2008 @@ -633,7 +634,7 @@ no_context:
2009         bust_spinlocks(1);
2010  
2011         if (oops_may_print()) {
2012 -       #ifdef CONFIG_X86_PAE
2013 +#ifdef CONFIG_X86_PAE
2014                 if (error_code & 16) {
2015                         pte_t *pte = lookup_address(address);
2016  
2017 @@ -642,7 +643,7 @@ no_context:
2018                                         "NX-protected page - exploit attempt? "
2019                                         "(uid: %d)\n", current->uid);
2020                 }
2021 -       #endif
2022 +#endif
2023                 if (address < PAGE_SIZE)
2024                         printk(KERN_ALERT "BUG: unable to handle kernel NULL "
2025                                         "pointer dereference");
2026 @@ -652,8 +653,8 @@ no_context:
2027                 printk(" at virtual address %08lx\n",address);
2028                 printk(KERN_ALERT " printing eip:\n");
2029                 printk("%08lx\n", regs->eip);
2030 +               dump_fault_path(address);
2031         }
2032 -       dump_fault_path(address);
2033         tsk->thread.cr2 = address;
2034         tsk->thread.trap_no = 14;
2035         tsk->thread.error_code = error_code;
2036 @@ -694,7 +695,6 @@ do_sigbus:
2037         force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
2038  }
2039  
2040 -#if !HAVE_SHARED_KERNEL_PMD
2041  void vmalloc_sync_all(void)
2042  {
2043         /*
2044 @@ -710,6 +710,9 @@ void vmalloc_sync_all(void)
2045         static unsigned long start = TASK_SIZE;
2046         unsigned long address;
2047  
2048 +       if (SHARED_KERNEL_PMD)
2049 +               return;
2050 +
2051         BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
2052         for (address = start;
2053              address >= TASK_SIZE && address < hypervisor_virt_start;
2054 @@ -739,4 +742,3 @@ void vmalloc_sync_all(void)
2055                         start = address + (1UL << PMD_SHIFT);
2056         }
2057  }
2058 -#endif
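
dump_fault_path() above now indexes the page directory with address >> PGDIR_SHIFT and the page table with (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1), translating each entry through machine_to_phys() and dereferencing the PTE page only when it is present and below max_low_pfn. The index arithmetic for the 2-level (non-PAE) i386 layout, with the constants written out and a made-up sample address:

#include <stdio.h>

#define DEMO_PAGE_SHIFT   12
#define DEMO_PGDIR_SHIFT  22
#define DEMO_PTRS_PER_PTE 1024

int main(void)
{
	unsigned long address = 0xc05a3b44UL;	/* sample faulting address */
	unsigned long pde_idx = address >> DEMO_PGDIR_SHIFT;
	unsigned long pte_idx = (address >> DEMO_PAGE_SHIFT)
				& (DEMO_PTRS_PER_PTE - 1);
	unsigned long offset  = address & ((1UL << DEMO_PAGE_SHIFT) - 1);

	/* pgd[pde_idx] yields the pde; machine_to_phys() of its frame,
	 * indexed by pte_idx, yields the pte, as in the hunk above. */
	printf("pde index %lu, pte index %lu, page offset 0x%lx\n",
	       pde_idx, pte_idx, offset);
	return 0;
}
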
2059 --- sle11sp1-2010-03-01.orig/arch/x86/mm/highmem_32-xen.c       2009-11-06 10:46:41.000000000 +0100
2060 +++ sle11sp1-2010-03-01/arch/x86/mm/highmem_32-xen.c    2009-11-06 10:49:47.000000000 +0100
2061 @@ -26,7 +26,7 @@ void kunmap(struct page *page)
2062   * However when holding an atomic kmap is is not legal to sleep, so atomic
2063   * kmaps are appropriate for short, tight code paths only.
2064   */
2065 -static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
2066 +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
2067  {
2068         enum fixed_addresses idx;
2069         unsigned long vaddr;
2070 @@ -49,15 +49,7 @@ static void *__kmap_atomic(struct page *
2071  
2072  void *kmap_atomic(struct page *page, enum km_type type)
2073  {
2074 -       return __kmap_atomic(page, type, kmap_prot);
2075 -}
2076 -
2077 -/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
2078 -void *kmap_atomic_pte(struct page *page, enum km_type type)
2079 -{
2080 -       return __kmap_atomic(page, type,
2081 -                            test_bit(PG_pinned, &page->flags)
2082 -                            ? PAGE_KERNEL_RO : kmap_prot);
2083 +       return kmap_atomic_prot(page, type, kmap_prot);
2084  }
2085  
2086  void kunmap_atomic(void *kvaddr, enum km_type type)
2087 @@ -80,6 +72,7 @@ void kunmap_atomic(void *kvaddr, enum km
2088  #endif
2089         }
2090  
2091 +       /*arch_flush_lazy_mmu_mode();*/
2092         pagefault_enable();
2093  }
2094  
2095 @@ -162,7 +155,6 @@ void copy_highpage(struct page *to, stru
2096  EXPORT_SYMBOL(kmap);
2097  EXPORT_SYMBOL(kunmap);
2098  EXPORT_SYMBOL(kmap_atomic);
2099 -EXPORT_SYMBOL(kmap_atomic_pte);
2100  EXPORT_SYMBOL(kunmap_atomic);
2101  EXPORT_SYMBOL(kmap_atomic_to_page);
2102  EXPORT_SYMBOL(clear_highpage);
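
With kmap_atomic_pte() removed above, a Xen-side caller that still wants a pinned page-table page mapped read-only would pass the protection to the now-exported kmap_atomic_prot() itself. A hypothetical caller, mirroring the body of the deleted helper (the function name is invented; KM_PTE0, PAGE_KERNEL_RO and kmap_prot are the 2.6.22-era definitions, and this is kernel context, not a standalone program):

/* Map a pinned page-table page read-only via kmap_atomic_prot(). */
static void *map_pinned_pte_page_demo(struct page *page)
{
	pgprot_t prot = test_bit(PG_pinned, &page->flags) ? PAGE_KERNEL_RO
							  : kmap_prot;

	return kmap_atomic_prot(page, KM_PTE0, prot);
}
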
2103 --- sle11sp1-2010-03-01.orig/arch/x86/mm/hypervisor.c   2009-11-06 10:45:48.000000000 +0100
2104 +++ sle11sp1-2010-03-01/arch/x86/mm/hypervisor.c        2009-11-06 10:49:47.000000000 +0100
2105 @@ -421,13 +421,13 @@ void xen_tlb_flush_all(void)
2106  }
2107  EXPORT_SYMBOL_GPL(xen_tlb_flush_all);
2108  
2109 -void xen_tlb_flush_mask(cpumask_t *mask)
2110 +void xen_tlb_flush_mask(const cpumask_t *mask)
2111  {
2112         struct mmuext_op op;
2113         if ( cpus_empty(*mask) )
2114                 return;
2115         op.cmd = MMUEXT_TLB_FLUSH_MULTI;
2116 -       set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
2117 +       set_xen_guest_handle(op.arg2.vcpumask, cpus_addr(*mask));
2118         BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
2119  }
2120  EXPORT_SYMBOL_GPL(xen_tlb_flush_mask);
2121 @@ -441,14 +441,14 @@ void xen_invlpg_all(unsigned long ptr)
2122  }
2123  EXPORT_SYMBOL_GPL(xen_invlpg_all);
2124  
2125 -void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
2126 +void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr)
2127  {
2128         struct mmuext_op op;
2129         if ( cpus_empty(*mask) )
2130                 return;
2131         op.cmd = MMUEXT_INVLPG_MULTI;
2132         op.arg1.linear_addr = ptr & PAGE_MASK;
2133 -       set_xen_guest_handle(op.arg2.vcpumask, mask->bits);
2134 +       set_xen_guest_handle(op.arg2.vcpumask, cpus_addr(*mask));
2135         BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
2136  }
2137  EXPORT_SYMBOL_GPL(xen_invlpg_mask);
2138 --- sle11sp1-2010-03-01.orig/arch/x86/mm/init_32-xen.c  2009-11-06 10:46:41.000000000 +0100
2139 +++ sle11sp1-2010-03-01/arch/x86/mm/init_32-xen.c       2009-11-06 10:49:47.000000000 +0100
2140 @@ -22,6 +22,7 @@
2141  #include <linux/init.h>
2142  #include <linux/highmem.h>
2143  #include <linux/pagemap.h>
2144 +#include <linux/pfn.h>
2145  #include <linux/poison.h>
2146  #include <linux/bootmem.h>
2147  #include <linux/slab.h>
2148 @@ -65,17 +66,19 @@ static pmd_t * __init one_md_table_init(
2149         pmd_t *pmd_table;
2150                 
2151  #ifdef CONFIG_X86_PAE
2152 -       pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
2153 -       paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
2154 -       make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
2155 -       set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
2156 -       pud = pud_offset(pgd, 0);
2157 -       if (pmd_table != pmd_offset(pud, 0)) 
2158 -               BUG();
2159 -#else
2160 +       if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
2161 +               pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
2162 +
2163 +               paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
2164 +               make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
2165 +               set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
2166 +               pud = pud_offset(pgd, 0);
2167 +               if (pmd_table != pmd_offset(pud, 0))
2168 +                       BUG();
2169 +       }
2170 +#endif
2171         pud = pud_offset(pgd, 0);
2172         pmd_table = pmd_offset(pud, 0);
2173 -#endif
2174  
2175         return pmd_table;
2176  }
2177 @@ -86,16 +89,18 @@ static pmd_t * __init one_md_table_init(
2178   */
2179  static pte_t * __init one_page_table_init(pmd_t *pmd)
2180  {
2181 +#if CONFIG_XEN_COMPAT <= 0x030002
2182         if (pmd_none(*pmd)) {
2183 +#else
2184 +       if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) {
2185 +#endif
2186                 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
2187 +
2188                 paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
2189                 make_lowmem_page_readonly(page_table,
2190                                           XENFEAT_writable_page_tables);
2191                 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
2192 -               if (page_table != pte_offset_kernel(pmd, 0))
2193 -                       BUG();  
2194 -
2195 -               return page_table;
2196 +               BUG_ON(page_table != pte_offset_kernel(pmd, 0));
2197         }
2198         
2199         return pte_offset_kernel(pmd, 0);
2200 @@ -115,7 +120,6 @@ static pte_t * __init one_page_table_ini
2201  static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
2202  {
2203         pgd_t *pgd;
2204 -       pud_t *pud;
2205         pmd_t *pmd;
2206         int pgd_idx, pmd_idx;
2207         unsigned long vaddr;
2208 @@ -126,12 +130,10 @@ static void __init page_table_range_init
2209         pgd = pgd_base + pgd_idx;
2210  
2211         for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
2212 -               if (pgd_none(*pgd)) 
2213 -                       one_md_table_init(pgd);
2214 -               pud = pud_offset(pgd, vaddr);
2215 -               pmd = pmd_offset(pud, vaddr);
2216 +               pmd = one_md_table_init(pgd);
2217 +               pmd = pmd + pmd_index(vaddr);
2218                 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
2219 -                       if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
2220 +                       if (vaddr < hypervisor_virt_start)
2221                                 one_page_table_init(pmd);
2222  
2223                         vaddr += PMD_SIZE;
2224 @@ -194,24 +196,25 @@ static void __init kernel_physical_mappi
2225                         /* Map with big pages if possible, otherwise create normal page tables. */
2226                         if (cpu_has_pse) {
2227                                 unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
2228 -
2229                                 if (is_kernel_text(address) || is_kernel_text(address2))
2230                                         set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
2231                                 else
2232                                         set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
2233 +
2234                                 pfn += PTRS_PER_PTE;
2235                         } else {
2236                                 pte = one_page_table_init(pmd);
2237  
2238 -                               pte += pte_ofs;
2239 -                               for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
2240 -                                               /* XEN: Only map initial RAM allocation. */
2241 -                                               if ((pfn >= max_ram_pfn) || pte_present(*pte))
2242 -                                                       continue;
2243 -                                               if (is_kernel_text(address))
2244 -                                                       set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
2245 -                                               else
2246 -                                                       set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
2247 +                               for (pte += pte_ofs;
2248 +                                    pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
2249 +                                    pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
2250 +                                       /* XEN: Only map initial RAM allocation. */
2251 +                                       if ((pfn >= max_ram_pfn) || pte_present(*pte))
2252 +                                               continue;
2253 +                                       if (is_kernel_text(address))
2254 +                                               set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
2255 +                                       else
2256 +                                               set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
2257                                 }
2258                                 pte_ofs = 0;
2259                         }
2260 @@ -388,15 +391,44 @@ extern void __init remap_numa_kva(void);
2261  
2262  pgd_t *swapper_pg_dir;
2263  
2264 +static void __init xen_pagetable_setup_start(pgd_t *base)
2265 +{
2266 +}
2267 +
2268 +static void __init xen_pagetable_setup_done(pgd_t *base)
2269 +{
2270 +}
2271 +
2272 +/*
2273 + * Build a proper pagetable for the kernel mappings.  Up until this
2274 + * point, we've been running on some set of pagetables constructed by
2275 + * the boot process.
2276 + *
2277 + * If we're booting on native hardware, this will be a pagetable
2278 + * constructed in arch/i386/kernel/head.S, and not running in PAE mode
2279 + * (even if we'll end up running in PAE).  The root of the pagetable
2280 + * will be swapper_pg_dir.
2281 + *
2282 + * If we're booting paravirtualized under a hypervisor, then there are
2283 + * more options: we may already be running PAE, and the pagetable may
2284 + * or may not be based in swapper_pg_dir.  In any case,
2285 + * paravirt_pagetable_setup_start() will set up swapper_pg_dir
2286 + * appropriately for the rest of the initialization to work.
2287 + *
2288 + * In general, pagetable_init() assumes that the pagetable may already
2289 + * be partially populated, and so it avoids stomping on any existing
2290 + * mappings.
2291 + */
2292  static void __init pagetable_init (void)
2293  {
2294 -       unsigned long vaddr;
2295 +       unsigned long vaddr, end;
2296         pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
2297  
2298 +       xen_pagetable_setup_start(pgd_base);
2299 +
2300         /* Enable PSE if available */
2301 -       if (cpu_has_pse) {
2302 +       if (cpu_has_pse)
2303                 set_in_cr4(X86_CR4_PSE);
2304 -       }
2305  
2306         /* Enable PGE if available */
2307         if (cpu_has_pge) {
2308 @@ -413,9 +445,12 @@ static void __init pagetable_init (void)
2309          * created - mappings will be set by set_fixmap():
2310          */
2311         vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
2312 -       page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
2313 +       end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
2314 +       page_table_range_init(vaddr, end, pgd_base);
2315  
2316         permanent_kmaps_init(pgd_base);
2317 +
2318 +       xen_pagetable_setup_done(pgd_base);
2319  }
2320  
2321  #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
2322 @@ -756,34 +791,29 @@ int remove_memory(u64 start, u64 size)
2323  EXPORT_SYMBOL_GPL(remove_memory);
2324  #endif
2325  
2326 -struct kmem_cache *pgd_cache;
2327  struct kmem_cache *pmd_cache;
2328  
2329  void __init pgtable_cache_init(void)
2330  {
2331 +       size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
2332 +
2333         if (PTRS_PER_PMD > 1) {
2334                 pmd_cache = kmem_cache_create("pmd",
2335                                         PTRS_PER_PMD*sizeof(pmd_t),
2336                                         PTRS_PER_PMD*sizeof(pmd_t),
2337 -                                       0,
2338 +                                       SLAB_PANIC,
2339                                         pmd_ctor,
2340                                         NULL);
2341 -               if (!pmd_cache)
2342 -                       panic("pgtable_cache_init(): cannot create pmd cache");
2343 +               if (!SHARED_KERNEL_PMD) {
2344 +                       /* If we're in PAE mode and have a non-shared
2345 +                       /* If we're in PAE mode and have a non-shared
2346 +                          kernel pmd, then the pgd allocation must be
2347 +                          a full page.  This is because the pgd_list
2348 +                          links through the page structure, so there
2349 +                          can only be one pgd per page for this to
2350 +                          work. */
2351 +               }
2352         }
2353 -       pgd_cache = kmem_cache_create("pgd",
2354 -#ifndef CONFIG_XEN
2355 -                               PTRS_PER_PGD*sizeof(pgd_t),
2356 -                               PTRS_PER_PGD*sizeof(pgd_t),
2357 -#else
2358 -                               PAGE_SIZE,
2359 -                               PAGE_SIZE,
2360 -#endif
2361 -                               0,
2362 -                               pgd_ctor,
2363 -                               PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
2364 -       if (!pgd_cache)
2365 -               panic("pgtable_cache_init(): Cannot create pgd cache");
2366  }
2367  
2368  /*
2369 @@ -817,13 +847,26 @@ static int noinline do_test_wp_bit(void)
2370  
2371  void mark_rodata_ro(void)
2372  {
2373 -       unsigned long addr = (unsigned long)__start_rodata;
2374 -
2375 -       for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
2376 -               change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
2377 +       unsigned long start = PFN_ALIGN(_text);
2378 +       unsigned long size = PFN_ALIGN(_etext) - start;
2379  
2380 -       printk("Write protecting the kernel read-only data: %uk\n",
2381 -                       (__end_rodata - __start_rodata) >> 10);
2382 +#ifndef CONFIG_KPROBES
2383 +#ifdef CONFIG_HOTPLUG_CPU
2384 +       /* It must still be possible to apply SMP alternatives. */
2385 +       if (num_possible_cpus() <= 1)
2386 +#endif
2387 +       {
2388 +               change_page_attr(virt_to_page(start),
2389 +                                size >> PAGE_SHIFT, PAGE_KERNEL_RX);
2390 +               printk("Write protecting the kernel text: %luk\n", size >> 10);
2391 +       }
2392 +#endif
2393 +       start += size;
2394 +       size = (unsigned long)__end_rodata - start;
2395 +       change_page_attr(virt_to_page(start),
2396 +                        size >> PAGE_SHIFT, PAGE_KERNEL_RO);
2397 +       printk("Write protecting the kernel read-only data: %luk\n",
2398 +              size >> 10);
2399  
2400         /*
2401          * change_page_attr() requires a global_flush_tlb() call after it.
2402 @@ -846,7 +889,7 @@ void free_init_pages(char *what, unsigne
2403                 free_page(addr);
2404                 totalram_pages++;
2405         }
2406 -       printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
2407 +       printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
2408  }
2409  
2410  void free_initmem(void)
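
The new mark_rodata_ro() above computes its ranges with PFN_ALIGN() and hands change_page_attr() a page count while the printk reports KiB; the arithmetic is easy to get wrong by a shift, so here it is as a small standalone program (the page size macro and the section addresses are made-up samples, not the kernel's _text/_etext/__end_rodata symbols):

#include <stdio.h>

#define DEMO_PAGE_SHIFT   12
#define DEMO_PAGE_SIZE    (1UL << DEMO_PAGE_SHIFT)
#define DEMO_PFN_ALIGN(x) (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))

int main(void)
{
	unsigned long text_start = 0xc0100000UL;	/* "_text" */
	unsigned long text_end   = 0xc02f1234UL;	/* "_etext" */
	unsigned long rodata_end = 0xc0380000UL;	/* "__end_rodata" */

	unsigned long start = DEMO_PFN_ALIGN(text_start);
	unsigned long size  = DEMO_PFN_ALIGN(text_end) - start;

	printf("text:   %lu pages -> PAGE_KERNEL_RX, %luk\n",
	       size >> DEMO_PAGE_SHIFT, size >> 10);

	start += size;
	size = rodata_end - start;
	printf("rodata: %lu pages -> PAGE_KERNEL_RO, %luk\n",
	       size >> DEMO_PAGE_SHIFT, size >> 10);
	return 0;
}
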
2411 --- sle11sp1-2010-03-01.orig/arch/x86/mm/ioremap_32-xen.c       2009-11-06 10:45:48.000000000 +0100
2412 +++ sle11sp1-2010-03-01/arch/x86/mm/ioremap_32-xen.c    2009-11-06 10:49:47.000000000 +0100
2413 @@ -13,6 +13,7 @@
2414  #include <linux/slab.h>
2415  #include <linux/module.h>
2416  #include <linux/io.h>
2417 +#include <linux/sched.h>
2418  #include <asm/fixmap.h>
2419  #include <asm/cacheflush.h>
2420  #include <asm/tlbflush.h>
2421 --- sle11sp1-2010-03-01.orig/arch/x86/mm/pgtable_32-xen.c       2009-11-06 10:46:41.000000000 +0100
2422 +++ sle11sp1-2010-03-01/arch/x86/mm/pgtable_32-xen.c    2009-11-06 10:49:47.000000000 +0100
2423 @@ -13,6 +13,7 @@
2424  #include <linux/pagemap.h>
2425  #include <linux/spinlock.h>
2426  #include <linux/module.h>
2427 +#include <linux/quicklist.h>
2428  
2429  #include <asm/system.h>
2430  #include <asm/pgtable.h>
2431 @@ -218,8 +219,6 @@ void pmd_ctor(void *pmd, struct kmem_cac
2432   * against pageattr.c; it is the unique case in which a valid change
2433   * of kernel pagetables can't be lazily synchronized by vmalloc faults.
2434   * vmalloc faults work because attached pagetables are never freed.
2435 - * The locking scheme was chosen on the basis of manfred's
2436 - * recommendations and having no core impact whatsoever.
2437   * -- wli
2438   */
2439  DEFINE_SPINLOCK(pgd_lock);
2440 @@ -245,37 +244,54 @@ static inline void pgd_list_del(pgd_t *p
2441                 set_page_private(next, (unsigned long)pprev);
2442  }
2443  
2444 -void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
2445 +
2446 +
2447 +#if (PTRS_PER_PMD == 1)
2448 +/* Non-PAE pgd constructor */
2449 +void pgd_ctor(void *pgd)
2450  {
2451         unsigned long flags;
2452  
2453 -       if (PTRS_PER_PMD > 1) {
2454 -               if (HAVE_SHARED_KERNEL_PMD)
2455 -                       clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
2456 -                                       swapper_pg_dir + USER_PTRS_PER_PGD,
2457 -                                       KERNEL_PGD_PTRS);
2458 -       } else {
2459 -               spin_lock_irqsave(&pgd_lock, flags);
2460 +       /* !PAE, no pagetable sharing */
2461 +       memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
2462 +
2463 +       spin_lock_irqsave(&pgd_lock, flags);
2464 +
2465 +       /* must happen under lock */
2466 +       clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
2467 +                       swapper_pg_dir + USER_PTRS_PER_PGD,
2468 +                       KERNEL_PGD_PTRS);
2469 +
2470 +       paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
2471 +                               __pa(swapper_pg_dir) >> PAGE_SHIFT,
2472 +                               USER_PTRS_PER_PGD,
2473 +                               KERNEL_PGD_PTRS);
2474 +       pgd_list_add(pgd);
2475 +       spin_unlock_irqrestore(&pgd_lock, flags);
2476 +}
2477 +#else  /* PTRS_PER_PMD > 1 */
2478 +/* PAE pgd constructor */
2479 +void pgd_ctor(void *pgd)
2480 +{
2481 +       /* PAE, kernel PMD may be shared */
2482 +
2483 +       if (SHARED_KERNEL_PMD) {
2484                 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
2485                                 swapper_pg_dir + USER_PTRS_PER_PGD,
2486                                 KERNEL_PGD_PTRS);
2487 +       } else {
2488                 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
2489 -
2490 -               /* must happen under lock */
2491 -               paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
2492 -                       __pa(swapper_pg_dir) >> PAGE_SHIFT,
2493 -                       USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
2494 -
2495 -               pgd_list_add(pgd);
2496 -               spin_unlock_irqrestore(&pgd_lock, flags);
2497         }
2498  }
2499 +#endif /* PTRS_PER_PMD */
2500  
2501 -/* never called when PTRS_PER_PMD > 1 */
2502 -void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
2503 +void pgd_dtor(void *pgd)
2504  {
2505         unsigned long flags; /* can be called from interrupt context */
2506  
2507 +       if (SHARED_KERNEL_PMD)
2508 +               return;
2509 +
2510         paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
2511         spin_lock_irqsave(&pgd_lock, flags);
2512         pgd_list_del(pgd);
2513 @@ -284,11 +300,46 @@ void pgd_dtor(void *pgd, struct kmem_cac
2514         pgd_test_and_unpin(pgd);
2515  }
2516  
2517 +#define UNSHARED_PTRS_PER_PGD                          \
2518 +       (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
2519 +
2520 +/* If we allocate a pmd for part of the kernel address space, then
2521 +   make sure it's initialized with the appropriate kernel mappings.
2522 +   Otherwise use a cached zeroed pmd.  */
2523 +static pmd_t *pmd_cache_alloc(int idx)
2524 +{
2525 +       pmd_t *pmd;
2526 +
2527 +       if (idx >= USER_PTRS_PER_PGD) {
2528 +               pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
2529 +
2530 +#ifndef CONFIG_XEN
2531 +               if (pmd)
2532 +                       memcpy(pmd,
2533 +                              (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
2534 +                              sizeof(pmd_t) * PTRS_PER_PMD);
2535 +#endif
2536 +       } else
2537 +               pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
2538 +
2539 +       return pmd;
2540 +}
2541 +
2542 +static void pmd_cache_free(pmd_t *pmd, int idx)
2543 +{
2544 +       if (idx >= USER_PTRS_PER_PGD) {
2545 +               make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
2546 +               memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
2547 +               free_page((unsigned long)pmd);
2548 +       } else
2549 +               kmem_cache_free(pmd_cache, pmd);
2550 +}
2551 +
2552  pgd_t *pgd_alloc(struct mm_struct *mm)
2553  {
2554         int i;
2555 -       pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
2556 -       pmd_t **pmd;
2557 +       pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
2558 +       pmd_t **pmds = NULL;
2559         unsigned long flags;
2560  
2561         pgd_test_and_unpin(pgd);
2562 @@ -296,37 +347,40 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
2563         if (PTRS_PER_PMD == 1 || !pgd)
2564                 return pgd;
2565  
2566 -       if (HAVE_SHARED_KERNEL_PMD) {
2567 -               for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
2568 -                       pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
2569 -                       if (!pmd)
2570 -                               goto out_oom;
2571 -                       paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
2572 -                       set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
2573 +#ifdef CONFIG_XEN
2574 +       if (!SHARED_KERNEL_PMD) {
2575 +               /*
2576 +                * We can race save/restore (if we sleep during a GFP_KERNEL memory
2577 +                * allocation). We therefore store virtual addresses of pmds as they
2578 +                * do not change across save/restore, and poke the machine addresses
2579 +                * into the pgdir under the pgd_lock.
2580 +                */
2581 +               pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
2582 +               if (!pmds) {
2583 +                       quicklist_free(0, pgd_dtor, pgd);
2584 +                       return NULL;
2585                 }
2586 -               return pgd;
2587 -       }
2588 -
2589 -       /*
2590 -        * We can race save/restore (if we sleep during a GFP_KERNEL memory
2591 -        * allocation). We therefore store virtual addresses of pmds as they
2592 -        * do not change across save/restore, and poke the machine addresses
2593 -        * into the pgdir under the pgd_lock.
2594 -        */
2595 -       pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
2596 -       if (!pmd) {
2597 -               kmem_cache_free(pgd_cache, pgd);
2598 -               return NULL;
2599         }
2600 +#endif
2601  
2602         /* Allocate pmds, remember virtual addresses. */
2603 -       for (i = 0; i < PTRS_PER_PGD; ++i) {
2604 -               pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
2605 -               if (!pmd[i])
2606 +       for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
2607 +               pmd_t *pmd = pmd_cache_alloc(i);
2608 +
2609 +               if (!pmd)
2610                         goto out_oom;
2611 +
2612                 paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
2613 +               if (pmds)
2614 +                       pmds[i] = pmd;
2615 +               else
2616 +                       set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
2617         }
2618  
2619 +#ifdef CONFIG_XEN
2620 +       if (SHARED_KERNEL_PMD)
2621 +               return pgd;
2622 +
2623         spin_lock_irqsave(&pgd_lock, flags);
2624  
2625         /* Protect against save/restore: move below 4GB under pgd_lock. */
2626 @@ -341,44 +395,43 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
2627  
2628         /* Copy kernel pmd contents and write-protect the new pmds. */
2629         for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
2630 -               unsigned long v = (unsigned long)i << PGDIR_SHIFT;
2631 -               pgd_t *kpgd = pgd_offset_k(v);
2632 -               pud_t *kpud = pud_offset(kpgd, v);
2633 -               pmd_t *kpmd = pmd_offset(kpud, v);
2634 -               memcpy(pmd[i], kpmd, PAGE_SIZE);
2635 +               memcpy(pmds[i],
2636 +                      (void *)pgd_page_vaddr(swapper_pg_dir[i]),
2637 +                      sizeof(pmd_t) * PTRS_PER_PMD);
2638                 make_lowmem_page_readonly(
2639 -                       pmd[i], XENFEAT_writable_page_tables);
2640 +                       pmds[i], XENFEAT_writable_page_tables);
2641         }
2642  
2643         /* It is safe to poke machine addresses of pmds under the pmd_lock. */
2644         for (i = 0; i < PTRS_PER_PGD; i++)
2645 -               set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
2646 +               set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
2647  
2648         /* Ensure this pgd gets picked up and pinned on save/restore. */
2649         pgd_list_add(pgd);
2650  
2651         spin_unlock_irqrestore(&pgd_lock, flags);
2652  
2653 -       kfree(pmd);
2654 +       kfree(pmds);
2655 +#endif
2656  
2657         return pgd;
2658  
2659  out_oom:
2660 -       if (HAVE_SHARED_KERNEL_PMD) {
2661 +       if (!pmds) {
2662                 for (i--; i >= 0; i--) {
2663                         pgd_t pgdent = pgd[i];
2664                         void* pmd = (void *)__va(pgd_val(pgdent)-1);
2665                         paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
2666 -                       kmem_cache_free(pmd_cache, pmd);
2667 +                       pmd_cache_free(pmd, i);
2668                 }
2669         } else {
2670                 for (i--; i >= 0; i--) {
2671 -                       paravirt_release_pd(__pa(pmd[i]) >> PAGE_SHIFT);
2672 -                       kmem_cache_free(pmd_cache, pmd[i]);
2673 +                       paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
2674 +                       pmd_cache_free(pmds[i], i);
2675                 }
2676 -               kfree(pmd);
2677 +               kfree(pmds);
2678         }
2679 -       kmem_cache_free(pgd_cache, pgd);
2680 +       quicklist_free(0, pgd_dtor, pgd);
2681         return NULL;
2682  }
2683  
2684 @@ -398,35 +451,24 @@ void pgd_free(pgd_t *pgd)
2685  
2686         /* in the PAE case user pgd entries are overwritten before usage */
2687         if (PTRS_PER_PMD > 1) {
2688 -               for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
2689 +               for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
2690                         pgd_t pgdent = pgd[i];
2691                         void* pmd = (void *)__va(pgd_val(pgdent)-1);
2692                         paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
2693 -                       kmem_cache_free(pmd_cache, pmd);
2694 +                       pmd_cache_free(pmd, i);
2695                 }
2696  
2697 -               if (!HAVE_SHARED_KERNEL_PMD) {