File linux-2.6-xen-execshield.patch of Package kernel
diff -r 095d53b0d1a6 arch/i386/kernel/cpu/common-xen.c
--- a/arch/i386/kernel/cpu/common-xen.c Tue Jul 25 21:53:33 2006 +0200
+++ b/arch/i386/kernel/cpu/common-xen.c Tue Jul 25 23:02:25 2006 +0200
@@ -431,6 +431,13 @@ void __cpuinit identify_cpu(struct cpuin
if (disable_pse)
clear_bit(X86_FEATURE_PSE, c->x86_capability);
+ if (exec_shield != 0) {
+#ifdef CONFIG_HIGHMEM64G /* NX implies PAE */
+ if (!test_bit(X86_FEATURE_NX, c->x86_capability))
+#endif
+ clear_bit(X86_FEATURE_SEP, c->x86_capability);
+ }
+
/* If the model name is still unset, do table lookup. */
if ( !c->x86_model_id[0] ) {
char *p;
diff -r 095d53b0d1a6 arch/i386/kernel/process-xen.c
--- a/arch/i386/kernel/process-xen.c Tue Jul 25 21:53:33 2006 +0200
+++ b/arch/i386/kernel/process-xen.c Tue Jul 25 23:02:25 2006 +0200
@@ -528,6 +528,9 @@ struct task_struct fastcall * __switch_t
else BUG_ON(!(read_cr0() & 8));
#endif
+ if (next_p->mm)
+ load_user_cs_desc(cpu, next_p->mm);
+
/*
* Reload esp0.
* This is load_esp0(tss, next) with a multicall.
@@ -810,3 +813,60 @@ unsigned long arch_align_stack(unsigned
sp -= get_random_int() % 8192;
return sp & ~0xf;
}
+
+void arch_add_exec_range(struct mm_struct *mm, unsigned long limit)
+{
+ if (limit > mm->context.exec_limit) {
+ mm->context.exec_limit = limit;
+ set_user_cs(&mm->context.user_cs, limit);
+ if (mm == current->mm) {
+ preempt_disable();
+ load_user_cs_desc(smp_processor_id(), mm);
+ preempt_enable();
+ }
+ }
+}
+
+void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end)
+{
+ struct vm_area_struct *vma;
+ unsigned long limit = PAGE_SIZE;
+
+ if (old_end == mm->context.exec_limit) {
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+ limit = vma->vm_end;
+
+ mm->context.exec_limit = limit;
+ set_user_cs(&mm->context.user_cs, limit);
+ if (mm == current->mm) {
+ preempt_disable();
+ load_user_cs_desc(smp_processor_id(), mm);
+ preempt_enable();
+ }
+ }
+}
+
+void arch_flush_exec_range(struct mm_struct *mm)
+{
+ mm->context.exec_limit = 0;
+ set_user_cs(&mm->context.user_cs, 0);
+}
+
+/*
+ * Generate random brk address between 128MB and 196MB. (if the layout
+ * allows it.)
+ */
+void randomize_brk(unsigned long old_brk)
+{
+ unsigned long new_brk, range_start, range_end;
+
+ range_start = 0x08000000;
+ if (current->mm->brk >= range_start)
+ range_start = current->mm->brk;
+ range_end = range_start + 0x02000000;
+ new_brk = randomize_range(range_start, range_end, 0);
+ if (new_brk)
+ current->mm->brk = new_brk;
+}
+
diff -r 095d53b0d1a6 arch/i386/kernel/smp-xen.c
--- a/arch/i386/kernel/smp-xen.c Tue Jul 25 21:53:33 2006 +0200
+++ b/arch/i386/kernel/smp-xen.c Tue Jul 25 23:02:25 2006 +0200
@@ -23,6 +23,7 @@
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
+#include <asm/desc.h>
#if 0
#include <mach_apic.h>
#endif
@@ -285,6 +286,8 @@ irqreturn_t smp_invalidate_interrupt(int
unsigned long cpu;
cpu = get_cpu();
+ if (current->active_mm)
+ load_user_cs_desc(cpu, current->active_mm);
if (!cpu_isset(cpu, flush_cpumask))
goto out;
diff -r 095d53b0d1a6 arch/i386/kernel/traps-xen.c
--- a/arch/i386/kernel/traps-xen.c Tue Jul 25 21:53:33 2006 +0200
+++ b/arch/i386/kernel/traps-xen.c Tue Jul 25 23:02:25 2006 +0200
@@ -558,11 +558,89 @@ DO_ERROR(11, SIGBUS, "segment not prese
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
+
+
+/*
+ * lazy-check for CS validity on exec-shield binaries:
+ *
+ * the original non-exec stack patch was written by
+ * Solar Designer <solar at openwall.com>. Thanks!
+ */
+static int
+check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code)
+{
+ struct desc_struct *desc1, *desc2;
+ struct vm_area_struct *vma;
+ unsigned long limit;
+
+ if (current->mm == NULL)
+ return 0;
+
+ limit = -1UL;
+ if (current->mm->context.exec_limit != -1UL) {
+ limit = PAGE_SIZE;
+ spin_lock(&current->mm->page_table_lock);
+ for (vma = current->mm->mmap; vma; vma = vma->vm_next)
+ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+ limit = vma->vm_end;
+ spin_unlock(&current->mm->page_table_lock);
+ if (limit >= TASK_SIZE)
+ limit = -1UL;
+ current->mm->context.exec_limit = limit;
+ }
+ set_user_cs(&current->mm->context.user_cs, limit);
+
+ desc1 = &current->mm->context.user_cs;
+ desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS;
+
+ if (desc1->a != desc2->a || desc1->b != desc2->b) {
+ /*
+ * The CS was not in sync - reload it and retry the
+ * instruction. If the instruction still faults then
+ * we won't hit this branch next time around.
+ */
+ if (print_fatal_signals >= 2) {
+ printk("#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n", error_code, error_code/8, regs->eip, smp_processor_id());
+ printk(" exec_limit: %08lx, user_cs: %08lx/%08lx, CPU_cs: %08lx/%08lx.\n", current->mm->context.exec_limit, desc1->a, desc1->b, desc2->a, desc2->b);
+ }
+ load_user_cs_desc(cpu, current->mm);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * The fixup code for errors in iret jumps to here (iret_exc). It loses
+ * the original trap number and error code. The bogus trap 32 and error
+ * code 0 are what the vanilla kernel delivers via:
+ * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
+ *
+ * In case of a general protection fault in the iret instruction, we
+ * need to check for a lazy CS update for exec-shield.
+ */
+fastcall void do_iret_error(struct pt_regs *regs, long error_code)
+{
+ int ok = check_lazy_exec_limit(get_cpu(), regs, error_code);
+ put_cpu();
+ if (!ok && notify_die(DIE_TRAP, "iret exception", regs,
+ error_code, 32, SIGSEGV) != NOTIFY_STOP) {
+ siginfo_t info;
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = ILL_BADSTK;
+ info.si_addr = 0;
+ do_trap(32, SIGSEGV, "iret exception", 0, regs, error_code,
+ &info);
+ }
+}
fastcall void __kprobes do_general_protection(struct pt_regs * regs,
long error_code)
{
+ int cpu = get_cpu();
+ int ok;
+
current->thread.error_code = error_code;
current->thread.trap_no = 13;
@@ -572,17 +650,31 @@ fastcall void __kprobes do_general_prote
if (!user_mode(regs))
goto gp_in_kernel;
+ ok = check_lazy_exec_limit(cpu, regs, error_code);
+
+ put_cpu();
+
+ if (ok)
+ return;
+
+ if (print_fatal_signals) {
+ printk("#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n", error_code, error_code/8, regs->eip, smp_processor_id());
+ printk(" exec_limit: %08lx, user_cs: %08lx/%08lx.\n", current->mm->context.exec_limit, current->mm->context.user_cs.a, current->mm->context.user_cs.b);
+ }
+
current->thread.error_code = error_code;
current->thread.trap_no = 13;
force_sig(SIGSEGV, current);
return;
gp_in_vm86:
+ put_cpu();
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
return;
gp_in_kernel:
+ put_cpu();
if (!fixup_exception(regs)) {
if (notify_die(DIE_GPF, "general protection fault", regs,
error_code, 13, SIGSEGV) == NOTIFY_STOP)
diff -r 095d53b0d1a6 arch/i386/mm/init-xen.c
--- a/arch/i386/mm/init-xen.c Tue Jul 25 21:53:33 2006 +0200
+++ b/arch/i386/mm/init-xen.c Tue Jul 25 23:02:25 2006 +0200
@@ -465,7 +465,7 @@ EXPORT_SYMBOL(__supported_pte_mask);
* Control non executable mappings.
*
* on Enable
- * off Disable
+ * off Disable (disables exec-shield too)
*/
void __init noexec_setup(const char *str)
{
@@ -475,6 +475,7 @@ void __init noexec_setup(const char *str
} else if (!strncmp(str,"off",3)) {
disable_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
+ exec_shield = 0;
}
}
@@ -541,7 +542,10 @@ void __init paging_init(void)
set_nx();
if (nx_enabled)
printk("NX (Execute Disable) protection: active\n");
-#endif
+ else
+#endif
+ if (exec_shield)
+ printk("Using x86 segment limits to approximate NX protection\n");
pagetable_init();
diff -r 095d53b0d1a6 arch/x86_64/ia32/syscall32-xen.c
--- a/arch/x86_64/ia32/syscall32-xen.c Tue Jul 25 21:53:33 2006 +0200
+++ b/arch/x86_64/ia32/syscall32-xen.c Tue Jul 25 23:02:25 2006 +0200
@@ -47,7 +47,9 @@ struct linux_binprm;
struct linux_binprm;
/* Setup a VMA at program startup for the vsyscall page */
-int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
+int syscall32_setup_pages(struct linux_binprm *bprm, int exstack,
+ unsigned long start_code,
+ unsigned long interp_map_address)
{
int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT;
struct vm_area_struct *vma;
diff -r 095d53b0d1a6 arch/x86_64/kernel/process-xen.c
--- a/arch/x86_64/kernel/process-xen.c Tue Jul 25 21:53:33 2006 +0200
+++ b/arch/x86_64/kernel/process-xen.c Tue Jul 25 23:02:25 2006 +0200
@@ -590,12 +590,6 @@ void set_personality_64bit(void)
/* Make sure to be in 64bit mode */
clear_thread_flag(TIF_IA32);
-
- /* TBD: overwrites user setup. Should have two bits.
- But 64bit processes have always behaved this way,
- so it's not too bad. The main problem is just that
- 32bit childs are affected again. */
- current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
diff -r 095d53b0d1a6 arch/x86_64/kernel/setup64-xen.c
--- a/arch/x86_64/kernel/setup64-xen.c Tue Jul 25 21:53:33 2006 +0200
+++ b/arch/x86_64/kernel/setup64-xen.c Tue Jul 25 23:02:25 2006 +0200
@@ -55,7 +55,7 @@ on Enable(default)
on Enable(default)
off Disable
*/
-int __init nonx_setup(char *str)
+void __init nonx_setup(char *str)
{
if (!strncmp(str, "on", 2)) {
__supported_pte_mask |= _PAGE_NX;
@@ -64,28 +64,7 @@ int __init nonx_setup(char *str)
do_not_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
- return 1;
-}
-__setup("noexec=", nonx_setup); /* parsed early actually */
-
-int force_personality32 = 0;
-
-/* noexec32=on|off
-Control non executable heap for 32bit processes.
-To control the stack too use noexec=off
-
-on PROT_READ does not imply PROT_EXEC for 32bit processes
-off PROT_READ implies PROT_EXEC (default)
-*/
-static int __init nonx32_setup(char *str)
-{
- if (!strcmp(str, "on"))
- force_personality32 &= ~READ_IMPLIES_EXEC;
- else if (!strcmp(str, "off"))
- force_personality32 |= READ_IMPLIES_EXEC;
- return 1;
-}
-__setup("noexec32=", nonx32_setup);
+}
/*
* Great future plan:
diff -r 095d53b0d1a6 arch/x86_64/mm/fault-xen.c
--- a/arch/x86_64/mm/fault-xen.c Tue Jul 25 21:53:33 2006 +0200
+++ b/arch/x86_64/mm/fault-xen.c Tue Jul 25 23:02:25 2006 +0200
@@ -114,7 +114,7 @@ static noinline int is_prefetch(struct p
instr = (unsigned char *)convert_rip_to_linear(current, regs);
max_instr = instr + 15;
- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE64)
return 0;
while (scan_more && instr < max_instr) {
diff -r 095d53b0d1a6 include/asm-i386/mach-xen/asm/desc.h
--- a/include/asm-i386/mach-xen/asm/desc.h Tue Jul 25 21:53:33 2006 +0200
+++ b/include/asm-i386/mach-xen/asm/desc.h Tue Jul 25 23:02:25 2006 +0200
@@ -159,6 +159,20 @@ static inline unsigned long get_desc_bas
return base;
}
+static inline void set_user_cs(struct desc_struct *desc, unsigned long limit)
+{
+ limit = (limit - 1) / PAGE_SIZE;
+ desc->a = limit & 0xffff;
+ desc->b = (limit & 0xf0000) | 0x00c0fb00;
+}
+
+#define load_user_cs_desc(cpu, mm) \
+ HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS]), (u64)(mm)->context.user_cs.a | ((u64)(mm)->context.user_cs.b) << 32);
+
+extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit);
+extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit);
+extern void arch_flush_exec_range(struct mm_struct *mm);
+
#endif /* !__ASSEMBLY__ */
#endif
diff -r 095d53b0d1a6 include/asm-i386/mach-xen/asm/mmu.h
--- a/include/asm-i386/mach-xen/asm/mmu.h Tue Jul 25 21:53:33 2006 +0200
+++ b/include/asm-i386/mach-xen/asm/mmu.h Tue Jul 25 23:02:25 2006 +0200
@@ -7,11 +7,15 @@
* we put the segment information here.
*
* cpu_vm_mask is used to optimize ldt flushing.
+ * exec_limit is used to track the range PROT_EXEC
+ * mappings span.
*/
typedef struct {
int size;
struct semaphore sem;
void *ldt;
+ struct desc_struct user_cs;
+ unsigned long exec_limit;
void *vdso;
} mm_context_t;
diff -r 095d53b0d1a6 include/asm-i386/mach-xen/asm/pgalloc.h
--- a/include/asm-i386/mach-xen/asm/pgalloc.h Tue Jul 25 21:53:33 2006 +0200
+++ b/include/asm-i386/mach-xen/asm/pgalloc.h Tue Jul 25 23:02:25 2006 +0200
@@ -2,6 +2,7 @@
#define _I386_PGALLOC_H
#include <asm/fixmap.h>
+#include <asm/desc.h>
#include <linux/threads.h>
#include <linux/mm.h> /* for struct page */
#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
diff -r 095d53b0d1a6 include/asm-i386/mach-xen/asm/processor.h
--- a/include/asm-i386/mach-xen/asm/processor.h Tue Jul 25 21:53:33 2006 +0200
+++ b/include/asm-i386/mach-xen/asm/processor.h Tue Jul 25 23:02:25 2006 +0200
@@ -333,7 +333,10 @@ extern int bootloader_type;
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
-#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
+#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
+
+#define __HAVE_ARCH_ALIGN_STACK
+extern unsigned long arch_align_stack(unsigned long sp);
#define HAVE_ARCH_PICK_MMAP_LAYOUT
@@ -526,6 +529,9 @@ static inline void __load_esp0(struct ts
regs->xcs = __USER_CS; \
regs->eip = new_eip; \
regs->esp = new_esp; \
+ preempt_disable(); \
+ load_user_cs_desc(smp_processor_id(), current->mm); \
+ preempt_enable(); \
} while (0)
/*
diff -r 095d53b0d1a6 include/asm-x86_64/mach-xen/asm/pgalloc.h
--- a/include/asm-x86_64/mach-xen/asm/pgalloc.h Tue Jul 25 21:53:33 2006 +0200
+++ b/include/asm-x86_64/mach-xen/asm/pgalloc.h Tue Jul 25 23:02:25 2006 +0200
@@ -8,6 +8,14 @@
#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
#include <xen/features.h>
+
+#define arch_add_exec_range(mm, limit) \
+ do { (void)(mm), (void)(limit); } while (0)
+#define arch_flush_exec_range(mm) \
+ do { (void)(mm); } while (0)
+#define arch_remove_exec_range(mm, limit) \
+ do { (void)(mm), (void)(limit); } while (0)
+
void make_page_readonly(void *va, unsigned int feature);
void make_page_writable(void *va, unsigned int feature);
void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
diff -r 095d53b0d1a6 include/asm-x86_64/mach-xen/asm/pgtable.h
--- a/include/asm-x86_64/mach-xen/asm/pgtable.h Tue Jul 25 21:53:33 2006 +0200
+++ b/include/asm-x86_64/mach-xen/asm/pgtable.h Tue Jul 25 23:02:25 2006 +0200
@@ -44,7 +44,7 @@ extern unsigned long __supported_pte_mas
#define swapper_pg_dir init_level4_pgt
-extern int nonx_setup(char *str);
+extern void nonx_setup(char *str);
extern void paging_init(void);
extern void clear_kernel_mapping(unsigned long addr, unsigned long size);