Discussion:
[RFC PATCH 2/6] arm64: avoid open-coding THREAD_SIZE{,_ORDER}
Mark Rutland
2017-07-12 22:32:59 UTC
Currently we define THREAD_SIZE_ORDER dependent on which arm64-specific
page size kconfig symbol was selected. This is unfortunate, as it hides
the relationship between THREAD_SIZE_ORDER and THREAD_SIZE, and makes it
more painful than necessary to modify the thread size, as we will
need to do for some debug configurations.

This patch follows arch/metag's approach of consistently defining
THREAD_SIZE in terms of THREAD_SIZE_ORDER. This avoids having ifdefs for
particular page size configurations, and allows us to change a single
definition to change the thread size.
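
For example, with 4K pages (PAGE_SHIFT == 12) the new definitions give
THREAD_SIZE_ORDER == 14 - 12 == 2 and THREAD_SIZE == 4096 << 2 == 16K, and
with 16K pages (PAGE_SHIFT == 14) they give THREAD_SIZE_ORDER == 0 and
THREAD_SIZE == 16K, matching the values we previously hard-coded.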

Signed-off-by: Mark Rutland <***@arm.com>
Cc: Catalin Marinas <***@arm.com>
Cc: Will Deacon <***@arm.com>
---
arch/arm64/include/asm/thread_info.h | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 141f13e9..6d0c59a 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -23,13 +23,17 @@

#include <linux/compiler.h>

-#ifdef CONFIG_ARM64_4K_PAGES
-#define THREAD_SIZE_ORDER 2
-#elif defined(CONFIG_ARM64_16K_PAGES)
+#include <asm/page.h>
+
+#define THREAD_SHIFT 14
+
+#if THREAD_SHIFT >= PAGE_SHIFT
+#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT)
+#else
#define THREAD_SIZE_ORDER 0
#endif

-#define THREAD_SIZE 16384
+#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#define THREAD_START_SP (THREAD_SIZE - 16)

#ifndef __ASSEMBLY__
--
1.9.1
Mark Rutland
2017-07-12 22:32:58 UTC
Today we use TPIDR_EL1 for our percpu offset, and SP_EL0 for current
(and current::thread_info, which is at offset 0).

Using SP_EL0 in this way prevents us from using EL1 thread mode, where
SP_EL0 is not addressable (since it's used as the active SP). It also
means we can't use SP_EL0 for other purposes (e.g. as a
scratch-register).

This patch frees up SP_EL0 for such usage, by storing the percpu offset
in current::thread_info, and using TPIDR_EL1 to store current. As we no
longer need to update SP_EL0 at EL0 exception boundaries, this allows us
to delete some code.

This new organisation means that we need to perform an additional load
to acquire the percpu offset. However, our assembly constraints allow
current to be cached, and therefore allow the offset to be cached.
Additionally, in most cases where we need the percpu offset, we also
need to fiddle with the preempt count or other data stored in
current::thread_info, so this data should already be hot in the caches.
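
Concretely, reading the percpu offset goes from a single system register
access:

	mrs	\dst, tpidr_el1

to a system register access plus a load, as in the new get_this_cpu_offset
macro below:

	mrs	\dst, tpidr_el1			// current
	ldr	\dst, [\dst, #TSK_TI_PCP]	// current->thread_info.pcp_offset

... where the mrs result can be shared with other uses of current.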

Signed-off-by: Mark Rutland <***@arm.com>
---
arch/arm64/include/asm/assembler.h | 11 ++++++++---
arch/arm64/include/asm/current.h | 6 +++---
arch/arm64/include/asm/percpu.h | 15 ++++-----------
arch/arm64/include/asm/thread_info.h | 1 +
arch/arm64/kernel/asm-offsets.c | 1 +
arch/arm64/kernel/entry.S | 11 ++---------
arch/arm64/kernel/head.S | 4 ++--
arch/arm64/kernel/process.c | 16 ++++------------
8 files changed, 25 insertions(+), 40 deletions(-)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 1b67c37..f7da6b5 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -229,6 +229,11 @@
#endif
.endm

+ .macro get_this_cpu_offset dst
+ mrs \dst, tpidr_el1
+ ldr \dst, [\dst, #TSK_TI_PCP]
+ .endm
+
/*
* @dst: Result of per_cpu(sym, smp_processor_id())
* @sym: The name of the per-cpu variable
@@ -236,7 +241,7 @@
*/
.macro adr_this_cpu, dst, sym, tmp
adr_l \dst, \sym
- mrs \tmp, tpidr_el1
+ get_this_cpu_offset \tmp
add \dst, \dst, \tmp
.endm

@@ -247,7 +252,7 @@
*/
.macro ldr_this_cpu dst, sym, tmp
adr_l \dst, \sym
- mrs \tmp, tpidr_el1
+ get_this_cpu_offset \tmp
ldr \dst, [\dst, \tmp]
.endm

@@ -438,7 +443,7 @@
* Return the current thread_info.
*/
.macro get_thread_info, rd
- mrs \rd, sp_el0
+ mrs \rd, tpidr_el1
.endm

/*
diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h
index f6580d4..54b271a 100644
--- a/arch/arm64/include/asm/current.h
+++ b/arch/arm64/include/asm/current.h
@@ -13,11 +13,11 @@
*/
static __always_inline struct task_struct *get_current(void)
{
- unsigned long sp_el0;
+ unsigned long cur;

- asm ("mrs %0, sp_el0" : "=r" (sp_el0));
+ asm ("mrs %0, tpidr_el1" : "=r" (cur));

- return (struct task_struct *)sp_el0;
+ return (struct task_struct *)cur;
}

#define current get_current()
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 3bd498e..05cf0f8 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -18,23 +18,16 @@

#include <asm/stack_pointer.h>

+#include <linux/thread_info.h>
+
static inline void set_my_cpu_offset(unsigned long off)
{
- asm volatile("msr tpidr_el1, %0" :: "r" (off) : "memory");
+ current_thread_info()->pcp_offset = off;
}

static inline unsigned long __my_cpu_offset(void)
{
- unsigned long off;
-
- /*
- * We want to allow caching the value, so avoid using volatile and
- * instead use a fake stack read to hazard against barrier().
- */
- asm("mrs %0, tpidr_el1" : "=r" (off) :
- "Q" (*(const unsigned long *)current_stack_pointer));
-
- return off;
+ return current_thread_info()->pcp_offset;
}
#define __my_cpu_offset __my_cpu_offset()

diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 46c3b93..141f13e9 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -50,6 +50,7 @@ struct thread_info {
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
u64 ttbr0; /* saved TTBR0_EL1 */
#endif
+ unsigned long pcp_offset;
int preempt_count; /* 0 => preemptable, <0 => bug */
};

diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index b3bb7ef..17001be 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -38,6 +38,7 @@ int main(void)
BLANK();
DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
+ DEFINE(TSK_TI_PCP, offsetof(struct task_struct, thread_info.pcp_offset));
DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit));
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index b738880..773b3fea 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -92,7 +92,7 @@

.if \el == 0
mrs x21, sp_el0
- ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear,
+ get_thread_info tsk // Ensure MDSCR_EL1.SS is clear,
ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug
disable_step_tsk x19, x20 // exceptions when scheduling.

@@ -147,13 +147,6 @@ alternative_else_nop_endif
.endif

/*
- * Set sp_el0 to current thread_info.
- */
- .if \el == 0
- msr sp_el0, tsk
- .endif
-
- /*
* Registers that may be useful after this macro is invoked:
*
* x21 - aborted SP
@@ -734,7 +727,7 @@ ENTRY(cpu_switch_to)
ldp x29, x9, [x8], #16
ldr lr, [x8]
mov sp, x9
- msr sp_el0, x1
+ msr tpidr_el1, x1
ret
ENDPROC(cpu_switch_to)

diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 973df7d..a58ecda 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -324,7 +324,7 @@ __primary_switched:
adrp x4, init_thread_union
add sp, x4, #THREAD_SIZE
adr_l x5, init_task
- msr sp_el0, x5 // Save thread_info
+ msr tpidr_el1, x5 // Save thread_info

adr_l x8, vectors // load VBAR_EL1 with virtual
msr vbar_el1, x8 // vector table address
@@ -615,7 +615,7 @@ __secondary_switched:
ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack
mov sp, x1
ldr x2, [x0, #CPU_BOOT_TASK]
- msr sp_el0, x2
+ msr tpidr_el1, x2
mov x29, #0
b secondary_start_kernel
ENDPROC(__secondary_switched)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index ae2a835..4212da3 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -323,18 +323,10 @@ void uao_thread_switch(struct task_struct *next)
}
}

-/*
- * We store our current task in sp_el0, which is clobbered by userspace. Keep a
- * shadow copy so that we can restore this upon entry from userspace.
- *
- * This is *only* for exception entry from EL0, and is not valid until we
- * __switch_to() a user task.
- */
-DEFINE_PER_CPU(struct task_struct *, __entry_task);
-
-static void entry_task_switch(struct task_struct *next)
+/* Ensure the new task has this CPU's offset */
+void pcp_thread_switch(struct task_struct *next)
{
- __this_cpu_write(__entry_task, next);
+ next->thread_info.pcp_offset = current_thread_info()->pcp_offset;
}

/*
@@ -349,8 +341,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
tls_thread_switch(next);
hw_breakpoint_thread_switch(next);
contextidr_thread_switch(next);
- entry_task_switch(next);
uao_thread_switch(next);
+ pcp_thread_switch(next);

/*
* Complete any pending TLB or cache maintenance on this CPU in case
--
1.9.1
Will Deacon
2017-07-14 01:30:54 UTC
Post by Mark Rutland
Today we use TPIDR_EL1 for our percpu offset, and SP_EL0 for current
(and current::thread_info, which is at offset 0).
Using SP_EL0 in this way prevents us from using EL1 thread mode, where
SP_EL0 is not addressable (since it's used as the active SP). It also
means we can't use SP_EL0 for other purposes (e.g. as a
scratch-register).
This patch frees up SP_EL0 for such usage, by storing the percpu offset
in current::thread_info, and using TPIDR_EL1 to store current. As we no
longer need to update SP_EL0 at EL0 exception boundaries, this allows us
to delete some code.
Does this mean we can just use asm-generic/percpu.h?

Will
Mark Rutland
2017-07-12 22:33:01 UTC
In subsequent patches, we'll want the base of the secondary stack in
secondary_start_kernel.

Pass the stack base down, as we do in the primary path, and add the
offset in secondary_start_kernel. Unfortunately, we can't encode
STACK_START_SP in an add immediate, so use a mov immedaite, which has
greater range.

This is far from a hot path, so the overhead shouldn't matter.
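
(For reference: with 16K stacks, THREAD_START_SP is 16384 - 16 = 0x3ff0,
which doesn't fit the 12-bit (optionally LSL #12) add immediate, but can
be materialized with a single mov.)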

Signed-off-by: Mark Rutland <***@arm.com>
---
arch/arm64/kernel/head.S | 3 ++-
arch/arm64/kernel/smp.c | 2 +-
2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index a58ecda..db77cac 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -613,7 +613,8 @@ __secondary_switched:

adr_l x0, secondary_data
ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack
- mov sp, x1
+ mov x3, #THREAD_START_SP
+ add sp, x1, x3
ldr x2, [x0, #CPU_BOOT_TASK]
msr tpidr_el1, x2
mov x29, #0
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 6e0e16a..269c957 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -154,7 +154,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
* page tables.
*/
secondary_data.task = idle;
- secondary_data.stack = task_stack_page(idle) + THREAD_START_SP;
+ secondary_data.stack = task_stack_page(idle);
update_cpu_boot_status(CPU_MMU_OFF);
__flush_dcache_area(&secondary_data, sizeof(secondary_data));
--
1.9.1
Mark Rutland
2017-07-12 22:33:00 UTC
Our THREAD_SHIFT may be smaller than PAGE_SHIFT. With VMAP_STACK, we can't
allow stacks to share a page with anything else, so we may as well pad
up to PAGE_SIZE, and have 64K stacks when we have 64K pages.

Signed-off-by: Mark Rutland <***@arm.com>
---
arch/arm64/include/asm/thread_info.h | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 6d0c59a..3684f86 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -25,7 +25,13 @@

#include <asm/page.h>

-#define THREAD_SHIFT 14
+#define __THREAD_SHIFT 14
+
+#if defined(CONFIG_VMAP_STACK) && (__THREAD_SHIFT < PAGE_SHIFT)
+#define THREAD_SHIFT PAGE_SHIFT
+#else
+#define THREAD_SHIFT __THREAD_SHIFT
+#endif

#if THREAD_SHIFT >= PAGE_SHIFT
#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT)
--
1.9.1
Mark Rutland
2017-07-12 22:33:03 UTC
Signed-off-by: Mark Rutland <***@arm.com>
---
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/entry.S | 43 +++++++++++++++++++++++++++++++++++++++++++
arch/arm64/kernel/traps.c | 21 +++++++++++++++++++++
3 files changed, 65 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b2024db..5cbd961 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1,5 +1,6 @@
config ARM64
def_bool y
+ select HAVE_ARCH_VMAP_STACK
select ACPI_CCA_REQUIRED if ACPI
select ACPI_GENERIC_GSI if ACPI
select ACPI_GTDT if ACPI
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 7c8b164..e0fdb65 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -396,11 +396,54 @@ el1_error_invalid:
inv_entry 1, BAD_ERROR
ENDPROC(el1_error_invalid)

+#ifdef CONFIG_VMAP_STACK
+.macro detect_bad_stack
+ msr sp_el0, x0
+ get_thread_info x0
+ ldr x0, [x0, #TSK_TI_CUR_STK]
+ sub x0, sp, x0
+ and x0, x0, #~(THREAD_SIZE - 1)
+ cbnz x0, __bad_stack
+ mrs x0, sp_el0
+.endm
+
+__bad_stack:
+ /*
+ * Stash the bad SP, and free up another GPR. We no longer care about
+ * EL0 state, since this thread cannot recover.
+ */
+ mov x0, sp
+ msr tpidrro_el0, x0
+ msr tpidr_el0, x1
+
+ /* Move to the emergency stack */
+ adr_this_cpu x0, bad_stack, x1
+ mov x1, #THREAD_START_SP
+ add sp, x0, x1
+
+ /* Restore GPRs and log them to pt_regs */
+ mrs x0, sp_el0
+ mrs x1, tpidr_el0
+ kernel_entry 1
+
+ /* restore the bad SP to pt_regs */
+ mrs x1, tpidrro_el0
+ str x1, [sp, #S_SP]
+
+ /* Time to die */
+ mov x0, sp
+ b handle_bad_stack
+#else
+.macro detect_bad_stack
+.endm
+#endif
+
/*
* EL1 mode handlers.
*/
.align 6
el1_sync:
+ detect_bad_stack
kernel_entry 1
mrs x1, esr_el1 // read the syndrome register
lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 0805b44..84b00e3 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -683,6 +683,27 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
force_sig_info(info.si_signo, &info, current);
}

+#ifdef CONFIG_VMAP_STACK
+DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], bad_stack) __aligned(16);
+
+asmlinkage void handle_bad_stack(struct pt_regs *regs)
+{
+ unsigned long tsk_stk = (unsigned long)current->stack;
+ unsigned long irq_stk = (unsigned long)per_cpu(irq_stack, smp_processor_id());
+
+ console_verbose();
+ pr_emerg("Stack out-of-bounds!\n"
+ "\tsp: 0x%016lx\n"
+ "\ttsk stack: [0x%016lx..0x%016lx]\n"
+ "\tirq stack: [0x%016lx..0x%016lx]\n",
+ kernel_stack_pointer(regs),
+ tsk_stk, tsk_stk + THREAD_SIZE,
+ irq_stk, irq_stk + THREAD_SIZE);
+ show_regs(regs);
+ panic("stack out-of-bounds");
+}
+#endif
+
void __pte_error(const char *file, int line, unsigned long val)
{
pr_err("%s:%d: bad pte %016lx.\n", file, line, val);
--
1.9.1
Ard Biesheuvel
2017-07-13 06:58:50 UTC
Hi Mark,
Post by Mark Rutland
---
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/entry.S | 43 +++++++++++++++++++++++++++++++++++++++++++
arch/arm64/kernel/traps.c | 21 +++++++++++++++++++++
3 files changed, 65 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b2024db..5cbd961 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1,5 +1,6 @@
config ARM64
def_bool y
+ select HAVE_ARCH_VMAP_STACK
select ACPI_CCA_REQUIRED if ACPI
select ACPI_GENERIC_GSI if ACPI
select ACPI_GTDT if ACPI
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 7c8b164..e0fdb65 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
inv_entry 1, BAD_ERROR
ENDPROC(el1_error_invalid)
+#ifdef CONFIG_VMAP_STACK
+.macro detect_bad_stack
+ msr sp_el0, x0
+ get_thread_info x0
+ ldr x0, [x0, #TSK_TI_CUR_STK]
+ sub x0, sp, x0
+ and x0, x0, #~(THREAD_SIZE - 1)
+ cbnz x0, __bad_stack
+ mrs x0, sp_el0
The typical prologue looks like

stp x29, x30, [sp, #-xxx]!
stp x27, x28, [sp, #xxx]
...
mov x29, sp

which means that in most cases where we do run off the stack, sp will
still be pointing into it when the exception is taken. This means we
will fault recursively in the handler before having had the chance to
accurately record the exception context.

Given that the max displacement of a store instruction is 512 bytes,
and that the frame size we are about to stash exceeds that, should we
already consider it a stack fault if sp is within 512 bytes (or
S_FRAME_SIZE) of the base of the stack?
Post by Mark Rutland
+.endm
+
+ /*
+ * Stash the bad SP, and free up another GPR. We no longer care about
+ * EL0 state, since this thread cannot recover.
+ */
+ mov x0, sp
+ msr tpidrro_el0, x0
+ msr tpidr_el0, x1
+
+ /* Move to the emergency stack */
+ adr_this_cpu x0, bad_stack, x1
+ mov x1, #THREAD_START_SP
+ add sp, x0, x1
+
+ /* Restore GPRs and log them to pt_regs */
+ mrs x0, sp_el0
+ mrs x1, tpidr_el0
+ kernel_entry 1
+
+ /* restore the bad SP to pt_regs */
+ mrs x1, tpidrro_el0
+ str x1, [sp, #S_SP]
+
+ /* Time to die */
+ mov x0, sp
+ b handle_bad_stack
+#else
+.macro detect_bad_stack
+.endm
+#endif
+
/*
* EL1 mode handlers.
*/
.align 6
+ detect_bad_stack
kernel_entry 1
mrs x1, esr_el1 // read the syndrome register
lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 0805b44..84b00e3 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -683,6 +683,27 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
force_sig_info(info.si_signo, &info, current);
}
+#ifdef CONFIG_VMAP_STACK
+DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], bad_stack) __aligned(16);
+
Surely, we don't need a 16 KB or 64 KB stack here?
Post by Mark Rutland
+asmlinkage void handle_bad_stack(struct pt_regs *regs)
+{
+ unsigned long tsk_stk = (unsigned long)current->stack;
+ unsigned long irq_stk = (unsigned long)per_cpu(irq_stack, smp_processor_id());
+
+ console_verbose();
+ pr_emerg("Stack out-of-bounds!\n"
+ "\tsp: 0x%016lx\n"
+ "\ttsk stack: [0x%016lx..0x%016lx]\n"
+ "\tirq stack: [0x%016lx..0x%016lx]\n",
+ kernel_stack_pointer(regs),
+ tsk_stk, tsk_stk + THREAD_SIZE,
+ irq_stk, irq_stk + THREAD_SIZE);
+ show_regs(regs);
+ panic("stack out-of-bounds");
+}
+#endif
+
void __pte_error(const char *file, int line, unsigned long val)
{
pr_err("%s:%d: bad pte %016lx.\n", file, line, val);
--
1.9.1
Mark Rutland
2017-07-13 10:49:50 UTC
Post by Ard Biesheuvel
Hi Mark,
Hi,
Post by Ard Biesheuvel
Post by Mark Rutland
+#ifdef CONFIG_VMAP_STACK
+.macro detect_bad_stack
+ msr sp_el0, x0
+ get_thread_info x0
+ ldr x0, [x0, #TSK_TI_CUR_STK]
+ sub x0, sp, x0
+ and x0, x0, #~(THREAD_SIZE - 1)
+ cbnz x0, __bad_stack
+ mrs x0, sp_el0
The typical prologue looks like
stp x29, x30, [sp, #-xxx]!
stp x27, x28, [sp, #xxx]
...
mov x29, sp
which means that in most cases where we do run off the stack, sp will
still be pointing into it when the exception is taken. This means we
will fault recursively in the handler before having had the chance to
accurately record the exception context.
True; I had mostly been thinking about kernel_entry, where we do an
explicit subtraction from the SP before any stores.
Post by Ard Biesheuvel
Given that the max displacement of a store instruction is 512 bytes,
and that the frame size we are about to stash exceeds that, should we
already consider it a stack fault if sp is within 512 bytes (or
S_FRAME_SIZE) of the base of the stack?
Good point.

I've flip-flopped on this point while writing this reply.

My original line of thinking was that it was best to rely on the
recursive fault to push the SP out-of-bounds. That keeps the overflow
detection simple/fast, and hopefully robust to unexpected exceptions,
(expected?) probes to the guard page, etc.

I also agree that it's annoying to lose the information associated with the
initial fault.

My fear is that we can't catch those cases robustly and efficiently. At
minimum, I believe we'd need to check:

* FAR_EL1 is out-of-bounds for the stack. You have a suitable check for
this.

* FAR_EL1 is valid (looking at the ESR_ELx.{EC,ISS}, etc). I'm not sure
exactly what we need to check here, and I'm not sure what we want to
do about reserved ESR_ELx encodings.

* The base register for the access was the SP (e.g. so this isn't a
probe_kernel_read() or similar).

... so my current feeling is that relying on the recursive fault is the
best bet, even if we lose some information from the initial fault.

Along with that, we should ensure that we get a reliable backtrace, so
that we have the PC from the initial fault, and can acquire the relevant
regs from a dump of the stack and/or the regs at the point of the
recursive fault.

FWIW, currently this series gives you something like:

[ 0.263544] Stack out-of-bounds!
[ 0.263544] sp: 0xffff000009fbfed0
[ 0.263544] tsk stack: [0xffff000009fc0000..0xffff000009fd0000]
[ 0.263544] irq stack: [0xffff80097fe100a0..0xffff80097fe200a0]
[ 0.304862] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.12.0-00006-g0c4fb26-dirty #73
[ 0.312830] Hardware name: ARM Juno development board (r1) (DT)
[ 0.318847] task: ffff800940d8a200 task.stack: ffff000009fc0000
[ 0.324872] PC is at el1_sync+0x20/0xc8
[ 0.328773] LR is at force_overflow+0xc/0x18
[ 0.333113] pc : [<ffff000008082460>] lr : [<ffff00000808c75c>] pstate: 600003c5
[ 0.340636] sp : ffff000009fbfed0
[ 0.344004] x29: ffff000009fc0000 x28: 0000000000000000
[ 0.349409] x27: 0000000000000000 x26: 0000000000000000
[ 0.354812] x25: 0000000000000000 x24: 0000000000000000
[ 0.360214] x23: 0000000000000000 x22: 0000000000000000
[ 0.365617] x21: 0000000000000000 x20: 0000000000000001
[ 0.371020] x19: 0000000000000001 x18: 0000000000000030
[ 0.376422] x17: 0000000000000000 x16: 0000000000000000
[ 0.381826] x15: 0000000000000008 x14: 000000000fb506bc
[ 0.387228] x13: 0000000000000000 x12: 0000000000000000
[ 0.392631] x11: 0000000000000000 x10: 0000000000000141
[ 0.398034] x9 : 0000000000000000 x8 : ffff80097fdf93e8
[ 0.403437] x7 : ffff80097fdf9410 x6 : 0000000000000001
[ 0.408839] x5 : ffff000008ebcb80 x4 : ffff000008eb65d8
[ 0.414242] x3 : 00000000000f4240 x2 : 0000000000000002
[ 0.419644] x1 : ffff800940d8a200 x0 : 0000000000000001
[ 0.425048] Kernel panic - not syncing: stack out-of-bounds
[ 0.430714] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.12.0-00006-g0c4fb26-dirty #73
[ 0.438679] Hardware name: ARM Juno development board (r1) (DT)
[ 0.444697] Call trace:
[ 0.447185] [<ffff000008086f68>] dump_backtrace+0x0/0x230
[ 0.452676] [<ffff00000808725c>] show_stack+0x14/0x20
[ 0.457815] [<ffff00000838760c>] dump_stack+0x9c/0xc0
[ 0.462953] [<ffff00000816d5a0>] panic+0x11c/0x294
[ 0.467825] [<ffff000008087a70>] __pte_error+0x0/0x28
[ 0.472961] [<ffff00000808c75c>] force_overflow+0xc/0x18
[ 0.478364] SMP: stopping secondary CPUs
[ 0.482356] ---[ end Kernel panic - not syncing: stack out-of-bounds

... that __pte_error() is because the last instruction in handle_bad_stack is a
tail-call to panic, and __pte_error happens to be next in the text.

I haven't yet dug into why the stacktrace ends abruptly. I think I need
to update stack walkers to understand the new stack, but I may also have
forgotten to do something with the frame record in the entry path.

[...]
Post by Ard Biesheuvel
Post by Mark Rutland
+#ifdef CONFIG_VMAP_STACK
+DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], bad_stack) __aligned(16);
+
Surely, we don't need a 16 KB or 64 KB stack here?
For most cases, we do not need such a big stack. We can probably drop
this down to something much smaller (1K, as with your series, sounds
sufficient).

The one case I was worried about was overflows on the emergency stack
itself. I believe that for dumping memory we might need to fix up
exceptions, and if that goes wrong we could go recursive.

I'd planned to update current_stack when jumping to the emergency stack,
and use the same (initial) bounds detection, requiring the emergency
stack to be the same size. In the case of an emergency stack overflow,
we'd go to a (stackless) wfi/wfe loop.

However, I deleted bits of that code while trying to debug an unrelated
issue, and didn't restore it.

I guess it depends on whether we want to try to handle that case.

Thanks,
Mark.
Ard Biesheuvel
2017-07-13 11:49:48 UTC
Post by Mark Rutland
Post by Ard Biesheuvel
Hi Mark,
Hi,
Post by Ard Biesheuvel
Post by Mark Rutland
+#ifdef CONFIG_VMAP_STACK
+.macro detect_bad_stack
+ msr sp_el0, x0
+ get_thread_info x0
+ ldr x0, [x0, #TSK_TI_CUR_STK]
+ sub x0, sp, x0
+ and x0, x0, #~(THREAD_SIZE - 1)
+ cbnz x0, __bad_stack
+ mrs x0, sp_el0
The typical prologue looks like
stp x29, x30, [sp, #-xxx]!
stp x27, x28, [sp, #xxx]
...
mov x29, sp
which means that in most cases where we do run off the stack, sp will
still be pointing into it when the exception is taken. This means we
will fault recursively in the handler before having had the chance to
accurately record the exception context.
True; I had mostly been thinking about kernel_entry, where we do an
explicit subtraction from the SP before any stores.
Post by Ard Biesheuvel
Given that the max displacement of a store instruction is 512 bytes,
and that the frame size we are about to stash exceeds that, should we
already consider it a stack fault if sp is within 512 bytes (or
S_FRAME_SIZE) of the base of the stack?
Good point.
I've flip-flopped on this point while writing this reply.
My original line of thinking was that it was best to rely on the
recursive fault to push the SP out-of-bounds. That keeps the overflow
detection simple/fast, and hopefully robust to unexpected exceptions,
(expected?) probes to the guard page, etc.
I also agree that it's annoying to lose the information associated with the
initial fault.
My fear is that we can't catch those cases robustly and efficiently. At
* FAR_EL1 is out-of-bounds for the stack. You have a suitable check for
this.
* FAR_EL1 is valid (looking at the ESR_ELx.{EC,ISS}, etc). I'm not sure
exactly what we need to check here, and I'm not sure what we want to
do about reserved ESR_ELx encodings.
* The base register for the access was the SP (e.g. so this isn't a
probe_kernel_read() or similar).
... so my current feeling is that relying on the recursive fault is the
best bet, even if we lose some information from the initial fault.
There are two related issues at play here that we shouldn't conflate:
- checking whether we have sufficient stack space left to be able to
handle the exception in the first place,
- figuring out whether *this* exception was caused by a faulting
dereference of the stack pointer (which could be with writeback, or
even via some intermediate register: x29 is often used as a pseudo
stack pointer IIRC, although it should never point below sp itself)

Given that the very first stp in kernel_entry will fault if we have
less than S_FRAME_SIZE bytes of stack left, I think we should check
that we have at least that much space available. That way, the context
is preserved, and we could restart the outer exception if we wanted
to, or point our pt_regs pointer to it etc.

When and how we diagnose the condition as a kernel stack overflow is a
separate issue, and can well wait until we're in C code.
Post by Mark Rutland
Along with that, we should ensure that we get a reliable backtrace, so
that we have the PC from the initial fault, and can acquire the relevant
regs from a dump of the stack and/or the regs at the point of the
recursive fault.
[ 0.263544] Stack out-of-bounds!
[ 0.263544] sp: 0xffff000009fbfed0
[ 0.263544] tsk stack: [0xffff000009fc0000..0xffff000009fd0000]
[ 0.263544] irq stack: [0xffff80097fe100a0..0xffff80097fe200a0]
[ 0.304862] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.12.0-00006-g0c4fb26-dirty #73
[ 0.312830] Hardware name: ARM Juno development board (r1) (DT)
[ 0.318847] task: ffff800940d8a200 task.stack: ffff000009fc0000
[ 0.324872] PC is at el1_sync+0x20/0xc8
[ 0.328773] LR is at force_overflow+0xc/0x18
[ 0.333113] pc : [<ffff000008082460>] lr : [<ffff00000808c75c>] pstate: 600003c5
[ 0.340636] sp : ffff000009fbfed0
[ 0.344004] x29: ffff000009fc0000 x28: 0000000000000000
[ 0.349409] x27: 0000000000000000 x26: 0000000000000000
[ 0.354812] x25: 0000000000000000 x24: 0000000000000000
[ 0.360214] x23: 0000000000000000 x22: 0000000000000000
[ 0.365617] x21: 0000000000000000 x20: 0000000000000001
[ 0.371020] x19: 0000000000000001 x18: 0000000000000030
[ 0.376422] x17: 0000000000000000 x16: 0000000000000000
[ 0.381826] x15: 0000000000000008 x14: 000000000fb506bc
[ 0.387228] x13: 0000000000000000 x12: 0000000000000000
[ 0.392631] x11: 0000000000000000 x10: 0000000000000141
[ 0.398034] x9 : 0000000000000000 x8 : ffff80097fdf93e8
[ 0.403437] x7 : ffff80097fdf9410 x6 : 0000000000000001
[ 0.408839] x5 : ffff000008ebcb80 x4 : ffff000008eb65d8
[ 0.414242] x3 : 00000000000f4240 x2 : 0000000000000002
[ 0.419644] x1 : ffff800940d8a200 x0 : 0000000000000001
[ 0.425048] Kernel panic - not syncing: stack out-of-bounds
[ 0.430714] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.12.0-00006-g0c4fb26-dirty #73
[ 0.438679] Hardware name: ARM Juno development board (r1) (DT)
[ 0.447185] [<ffff000008086f68>] dump_backtrace+0x0/0x230
[ 0.452676] [<ffff00000808725c>] show_stack+0x14/0x20
[ 0.457815] [<ffff00000838760c>] dump_stack+0x9c/0xc0
[ 0.462953] [<ffff00000816d5a0>] panic+0x11c/0x294
[ 0.467825] [<ffff000008087a70>] __pte_error+0x0/0x28
[ 0.472961] [<ffff00000808c75c>] force_overflow+0xc/0x18
[ 0.478364] SMP: stopping secondary CPUs
[ 0.482356] ---[ end Kernel panic - not syncing: stack out-of-bounds
... that __pte_error() is because the last instruction in handle_bad_stack is a
tail-call to panic, and __pte_error happens to be next in the text.
I haven't yet dug into why the stacktrace ends abruptly. I think I need
to update stack walkers to understand the new stack, but I may also have
forgotten to do something with the frame record in the entry path.
[...]
Post by Ard Biesheuvel
Post by Mark Rutland
+#ifdef CONFIG_VMAP_STACK
+DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], bad_stack) __aligned(16);
+
Surely, we don't need a 16 KB or 64 KB stack here?
For most cases, we do not need such a big stack. We can probably drop
this down to something much smaller (1K, as with your series, sounds
sufficient).
The one case I was worried about was overflows on the emergency stack
itself. I believe that for dumping memory we might need to fix up
exceptions, and if that goes wrong we could go recursive.
I'd planned to update current_stack when jumping to the emergency stack,
and use the same (initial) bounds detection, requiring the emergency
stack to be the same size. In the case of an emergency stack overflow,
we'd go to a (stackless) wfi/wfe loop.
However, I deleted bits of that code while trying to debug an unrelated
issue, and didn't restore it.
I guess it depends on whether we want to try to handle that case.
Thanks,
Mark.
Mark Rutland
2017-07-13 16:10:50 UTC
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
The typical prologue looks like
stp x29, x30, [sp, #-xxx]!
stp x27, x28, [sp, #xxx]
...
mov x29, sp
which means that in most cases where we do run off the stack, sp will
still be pointing into it when the exception is taken. This means we
will fault recursively in the handler before having had the chance to
accurately record the exception context.
Given that the max displacement of a store instruction is 512 bytes,
and that the frame size we are about to stash exceeds that, should we
already consider it a stack fault if sp is within 512 bytes (or
S_FRAME_SIZE) of the base of the stack?
My original line of thinking was that it was best to rely on the
recursive fault to push the SP out-of-bounds. That keeps the overflow
detection simple/fast, and hopefully robust to unexpected exceptions,
(expected?) probes to the guard page, etc.
I also agree that it's annoying to lose the information associated with the
initial fault.
My fear is that we can't catch those cases robustly and efficiently. At
* FAR_EL1 is out-of-bounds for the stack. You have a suitable check for
this.
* FAR_EL1 is valid (looking at the ESR_ELx.{EC,ISS}, etc). I'm not sure
exactly what we need to check here, and I'm not sure what we want to
do about reserved ESR_ELx encodings.
* The base register for the access was the SP (e.g. so this isn't a
probe_kernel_read() or similar).
... so my current feeling is that relying on the recursive fault is the
best bet, even if we lose some information from the initial fault.
- checking whether we have sufficient stack space left to be able to
handle the exception in the first place,
- figuring out whether *this* exception was caused by a faulting
dereference of the stack pointer (which could be with writeback, or
even via some intermediate register: x29 is often used as a pseudo
stack pointer IIRC, although it should never point below sp itself)
Sure; I agree these are separate properties (my robustness and
efficiency concerns fall with the latter).
Post by Ard Biesheuvel
Given that the very first stp in kernel_entry will fault if we have
less than S_FRAME_SIZE bytes of stack left, I think we should check
that we have at least that much space available.
I was going to reply saying that I didn't agree, but in writing up
examples, I mostly convinced myself that this is the right thing to do.
So I mostly agree!

This would mean we treat the first impossible-to-handle exception as
that fatal case, which is similar to x86's double-fault, triggered when
the HW can't stack the regs. All other cases are just arbitrary faults.

However, to provide that consistently, we'll need to perform this check
at every exception boundary, or some of those cases will result in a
recursive fault first.

So I think there are three choices:

1) In el1_sync, only check SP bounds, and live with the recursive
faults.

2) in el1_sync, check there's room for the regs, and live with the
recursive faults for overflow on other exceptions.

3) In all EL1 entry paths, check there's room for the regs.
Post by Ard Biesheuvel
That way, the context is preserved, and we could restart the outer
exception if we wanted to, or point our pt_regs pointer to it etc.
When and how we diagnose the condition as a kernel stack overflow is a
separate issue, and can well wait until we're in C code.
I believe that determining whether the exception was caused by a stack
overflow is not something we can do robustly or efficiently.

You mentioned the x29 pseudo-sp case, and there are other cases where
the SP value is proxied:

mov x0, sp
ldr x0, [x0, x1]

Or unrelated accesses that hit the guard page:

adrp x0, some_vmalloc_object
add x0, x0, #:lo12:some_vmalloc_object
mov x1, #bogus_offset
ldr x0, [x0, x1]

As above, I think it's helpful to think of this as something closer to a
double-fault handler (i.e. aiming to catch when we can't take the
exception safely), rather than something that's trying to catch logical
stack overflows.

Thanks,
Mark.
Mark Rutland
2017-07-13 17:55:45 UTC
Post by Mark Rutland
Post by Ard Biesheuvel
Given that the very first stp in kernel_entry will fault if we have
less than S_FRAME_SIZE bytes of stack left, I think we should check
that we have at least that much space available.
I was going to reply saying that I didn't agree, but in writing up
examples, I mostly convinced myself that this is the right thing to do.
So I mostly agree!
This would mean we treat the first impossible-to-handle exception as
that fatal case, which is similar to x86's double-fault, triggered when
the HW can't stack the regs. All other cases are just arbitrary faults.
However, to provide that consistently, we'll need to perform this check
at every exception boundary, or some of those cases will result in a
recursive fault first.
1) In el1_sync, only check SP bounds, and live with the recursive
faults.
2) in el1_sync, check there's room for the regs, and live with the
recursive faults for overflow on other exceptions.
3) In all EL1 entry paths, check there's room for the regs.
FWIW, for the moment I've applied (2), as you suggested, to my
arm64/vmap-stack branch, adding an additional:

sub x0, x0, #S_FRAME_SIZE

... to the entry path.
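
I.e. the check in detect_bad_stack now looks something like the below
(sketch only; the exact placement of the new sub may differ in my branch):

	msr	sp_el0, x0
	get_thread_info x0
	ldr	x0, [x0, #TSK_TI_CUR_STK]
	sub	x0, sp, x0
	sub	x0, x0, #S_FRAME_SIZE		// require room for the regs
	and	x0, x0, #~(THREAD_SIZE - 1)
	cbnz	x0, __bad_stack
	mrs	x0, sp_el0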

I think it's worth trying (3) so that we consistently report these
cases, benchmarks permitting.

It's probably worth putting the fast-path check directly into the
vectors, where we currently only use 1/32 of the instruction slots
available to us (each vector slot is 128 bytes, i.e. 32 instructions,
and today holds just a branch).
Post by Mark Rutland
As above, I think it's helpful to think of this as something closer to a
double-fault handler (i.e. aiming to catch when we can't take the
exception safely), rather than something that's trying to catch logical
stack overflows.
Does this make sense to you?

I've tried to reword the log output, as below, to give this impression.

[ 49.288232] Insufficient stack space to handle exception!
[ 49.288245] CPU: 5 PID: 2208 Comm: bash Not tainted 4.12.0-00005-ga781af2 #81
[ 49.300680] Hardware name: ARM Juno development board (r1) (DT)
[ 49.306549] task: ffff800974955100 task.stack: ffff00000d6f0000
[ 49.312426] PC is at recursive_loop+0x10/0x50
[ 49.316747] LR is at recursive_loop+0x34/0x50
[ 49.321066] pc : [<ffff000008588aa0>] lr : [<ffff000008588ac4>] pstate: 40000145
[ 49.328398] sp : ffff00000d6eff30
[ 49.331682] x29: ffff00000d6f0350 x28: ffff800974955100
[ 49.336953] x27: ffff000008942000 x26: ffff000008f0d758
[ 49.342223] x25: ffff00000d6f3eb8 x24: ffff00000d6f3eb8
[ 49.347493] x23: ffff000008f0d490 x22: 0000000000000009
[ 49.352764] x21: ffff800974a57000 x20: ffff000008f0d4e0
[ 49.358034] x19: 0000000000000013 x18: 0000ffffe7e2e4f0
[ 49.363304] x17: 0000ffff9c1256a4 x16: ffff0000081f8b88
[ 49.368574] x15: 00002a81b8000000 x14: 00000000fffffff0
[ 49.373845] x13: ffff000008f6278a x12: ffff000008e62818
[ 49.379115] x11: 0000000000000000 x10: 000000000000019e
[ 49.384385] x9 : 0000000000000004 x8 : ffff00000d6f0770
[ 49.389656] x7 : 1313131313131313 x6 : 000000000000019e
[ 49.394925] x5 : 0000000000000000 x4 : 0000000000000000
[ 49.400205] x3 : 0000000000000000 x2 : 0000000000000400
[ 49.405484] x1 : 0000000000000013 x0 : 0000000000000012
[ 49.410764] Task stack: [0xffff00000d6f0000..0xffff00000d6f4000]
[ 49.416728] IRQ stack: [0xffff80097ffb90a0..0xffff80097ffbd0a0]
[ 49.422692] ESR: 0x96000047 -- DABT (current EL)
[ 49.427277] FAR: 0xffff00000d6eff30
[ 49.430742] Kernel panic - not syncing: kernel stack overflow
[ 49.436451] CPU: 5 PID: 2208 Comm: bash Not tainted 4.12.0-00005-ga781af2 #81
[ 49.443534] Hardware name: ARM Juno development board (r1) (DT)
[ 49.449412] Call trace:
[ 49.451852] [<ffff0000080885f0>] dump_backtrace+0x0/0x230
[ 49.457218] [<ffff0000080888e4>] show_stack+0x14/0x20
[ 49.462240] [<ffff00000839be0c>] dump_stack+0x9c/0xc0
[ 49.467261] [<ffff000008175218>] panic+0x11c/0x294
[ 49.472024] [<ffff000008089184>] handle_bad_stack+0xe4/0xe8
[ 49.477561] [<ffff000008588ac4>] recursive_loop+0x34/0x50
[ 49.482926] SMP: stopping secondary CPUs
[ 49.487145] Kernel Offset: disabled
[ 49.490609] Memory Limit: none
[ 49.493649] ---[ end Kernel panic - not syncing: kernel stack overflow

... I still need to update the backtracing code to walk across stacks.

Thanks,
Mark.
Ard Biesheuvel
2017-07-13 18:28:48 UTC
Post by Mark Rutland
Post by Mark Rutland
Post by Ard Biesheuvel
Given that the very first stp in kernel_entry will fault if we have
less than S_FRAME_SIZE bytes of stack left, I think we should check
that we have at least that much space available.
I was going to reply saying that I didn't agree, but in writing up
examples, I mostly convinced myself that this is the right thing to do.
So I mostly agree!
This would mean we treat the first impossible-to-handle exception as
that fatal case, which is similar to x86's double-fault, triggered when
the HW can't stack the regs. All other cases are just arbitrary faults.
However, to provide that consistently, we'll need to perform this check
at every exception boundary, or some of those cases will result in a
recursive fault first.
1) In el1_sync, only check SP bounds, and live with the recursive
faults.
2) in el1_sync, check there's room for the regs, and live with the
recursive faults for overflow on other exceptions.
3) In all EL1 entry paths, check there's room for the regs.
FWIW, for the moment I've applied (2), as you suggested, to my
sub x0, x0, #S_FRAME_SIZE
... to the entry path.
I think it's worth trying (3) so that we consistently report these
cases, benchmarks permitting.
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped

That way, we can compare SP (minus S_FRAME_SIZE) against a mask that
is a build time constant, to decide whether its value points into a
stack or not. Of course, it may be pointing into the wrong stack, but
that should not prevent us from taking the exception, and we can deal
with that later. It would give us a very cheap way to perform this
test on the hot paths.
Post by Mark Rutland
Post by Mark Rutland
I believe that determining whether the exception was caused by a stack
overflow is not something we can do robustly or efficiently.
Actually, if the stack pointer is within S_FRAME_SIZE of the base, and
the faulting address points into the guard page, that is a pretty
strong indicator that the stack overflowed. That shouldn't be too
costly?
Post by Mark Rutland
It's probably worth putting the fast-path check directly into the
vectors, where we currently only use 1/32 of the instruction slots
available to us.
Post by Mark Rutland
As above, I think it's helpful to think of this as something closer to a
double-fault handler (i.e. aiming to catch when we can't take the
exception safely), rather than something that's trying to catch logical
stack overflows.
Does this make sense to you?
I've tried to reword the log output, as below, to give this impression.
[ 49.288232] Insufficient stack space to handle exception!
This could be a separate warning, if we find out that the actual
exception was caused by something else.
Post by Mark Rutland
[ 49.288245] CPU: 5 PID: 2208 Comm: bash Not tainted 4.12.0-00005-ga781af2 #81
[ 49.300680] Hardware name: ARM Juno development board (r1) (DT)
[ 49.306549] task: ffff800974955100 task.stack: ffff00000d6f0000
[ 49.312426] PC is at recursive_loop+0x10/0x50
[ 49.316747] LR is at recursive_loop+0x34/0x50
[ 49.321066] pc : [<ffff000008588aa0>] lr : [<ffff000008588ac4>] pstate: 40000145
[ 49.328398] sp : ffff00000d6eff30
[ 49.331682] x29: ffff00000d6f0350 x28: ffff800974955100
[ 49.336953] x27: ffff000008942000 x26: ffff000008f0d758
[ 49.342223] x25: ffff00000d6f3eb8 x24: ffff00000d6f3eb8
[ 49.347493] x23: ffff000008f0d490 x22: 0000000000000009
[ 49.352764] x21: ffff800974a57000 x20: ffff000008f0d4e0
[ 49.358034] x19: 0000000000000013 x18: 0000ffffe7e2e4f0
[ 49.363304] x17: 0000ffff9c1256a4 x16: ffff0000081f8b88
[ 49.368574] x15: 00002a81b8000000 x14: 00000000fffffff0
[ 49.373845] x13: ffff000008f6278a x12: ffff000008e62818
[ 49.379115] x11: 0000000000000000 x10: 000000000000019e
[ 49.384385] x9 : 0000000000000004 x8 : ffff00000d6f0770
[ 49.389656] x7 : 1313131313131313 x6 : 000000000000019e
[ 49.394925] x5 : 0000000000000000 x4 : 0000000000000000
[ 49.400205] x3 : 0000000000000000 x2 : 0000000000000400
[ 49.405484] x1 : 0000000000000013 x0 : 0000000000000012
[ 49.410764] Task stack: [0xffff00000d6f0000..0xffff00000d6f4000]
[ 49.416728] IRQ stack: [0xffff80097ffb90a0..0xffff80097ffbd0a0]
[ 49.422692] ESR: 0x96000047 -- DABT (current EL)
[ 49.427277] FAR: 0xffff00000d6eff30
[ 49.430742] Kernel panic - not syncing: kernel stack overflow
[ 49.436451] CPU: 5 PID: 2208 Comm: bash Not tainted 4.12.0-00005-ga781af2 #81
[ 49.443534] Hardware name: ARM Juno development board (r1) (DT)
[ 49.451852] [<ffff0000080885f0>] dump_backtrace+0x0/0x230
[ 49.457218] [<ffff0000080888e4>] show_stack+0x14/0x20
[ 49.462240] [<ffff00000839be0c>] dump_stack+0x9c/0xc0
[ 49.467261] [<ffff000008175218>] panic+0x11c/0x294
[ 49.472024] [<ffff000008089184>] handle_bad_stack+0xe4/0xe8
[ 49.477561] [<ffff000008588ac4>] recursive_loop+0x34/0x50
[ 49.482926] SMP: stopping secondary CPUs
[ 49.487145] Kernel Offset: disabled
[ 49.490609] Memory Limit: none
[ 49.493649] ---[ end Kernel panic - not syncing: kernel stack overflow
Yes, this looks nice.
Post by Mark Rutland
... I still need to attack the backtracing to walk across stacks.
Yup
Mark Rutland
2017-07-14 10:32:59 UTC
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Mark Rutland
Post by Ard Biesheuvel
Given that the very first stp in kernel_entry will fault if we have
less than S_FRAME_SIZE bytes of stack left, I think we should check
that we have at least that much space available.
I was going to reply saying that I didn't agree, but in writing up
examples, I mostly convinced myself that this is the right thing to do.
So I mostly agree!
This would mean we treat the first impossible-to-handle exception as
that fatal case, which is similar to x86's double-fault, triggered when
the HW can't stack the regs. All other cases are just arbitrary faults.
However, to provide that consistently, we'll need to perform this check
at every exception boundary, or some of those cases will result in a
recursive fault first.
1) In el1_sync, only check SP bounds, and live with the recursive
faults.
2) in el1_sync, check there's room for the regs, and live with the
recursive faults for overflow on other exceptions.
3) In all EL1 entry paths, check there's room for the regs.
FWIW, for the moment I've applied (2), as you suggested, to my
sub x0, x0, #S_FRAME_SIZE
... to the entry path.
I think it's worth trying (3) so that we consistently report these
cases, benchmarks permitting.
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
This is not such a crazy idea. :)

In fact, it was one I toyed with before getting lost on a register
juggling tangent (see below).
Post by Ard Biesheuvel
That way, we can compare SP (minus S_FRAME_SIZE) against a mask that
is a build time constant, to decide whether its value points into a
stack or not. Of course, it may be pointing into the wrong stack, but
that should not prevent us from taking the exception, and we can deal
with that later. It would give us a very cheap way to perform this
test on the hot paths.
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.

Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.

However, I now realise I was being thick, since we can stash the GPR
in the SP:

sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
add sp, sp, x0

... so yes, this could work!

This means that we have to align the initial task, so the kernel Image
will grow by THREAD_SIZE. Likewise for IRQ stacks, unless we can rework
things such that we can dynamically allocate all of those.
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Mark Rutland
I believe that determining whether the exception was caused by a stack
overflow is not something we can do robustly or efficiently.
Actually, if the stack pointer is within S_FRAME_SIZE of the base, and
the faulting address points into the guard page, that is a pretty
strong indicator that the stack overflowed. That shouldn't be too
costly?
Sure, but that's still a heuristic. For example, that also catches an
unrelated vmalloc address gone wrong, while SP was close to the end of
the stack.

The important thing is whether we can *safely enter the exception* (i.e.
stack the regs), or whether this'll push the SP (further) out-of-bounds.
I think we agree that we can reliably and efficiently check this.

The general case of nominal "stack overflows" (e.g. large preidx
decrements, proxied SP values, unrelated guard-page faults) is a
semantic minefield. I don't think we should add code to try to
distinguish these.

For that general case, if we can enter the exception then we can try to
handle the exception in the usual way, and either:

* The fault code determines the access was bad. We at least kill the
thread.

* We overflow the stack while trying to handle the exception, triggering
a new fault to triage.

To make it possible to distinguish and debug these, we need to fix the
backtracing code, but that's it.

Thanks,
Mark.
Ard Biesheuvel
2017-07-14 10:48:20 UTC
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Mark Rutland
Post by Ard Biesheuvel
Given that the very first stp in kernel_entry will fault if we have
less than S_FRAME_SIZE bytes of stack left, I think we should check
that we have at least that much space available.
I was going to reply saying that I didn't agree, but in writing up
examples, I mostly convinced myself that this is the right thing to do.
So I mostly agree!
This would mean we treat the first impossible-to-handle exception as
that fatal case, which is similar to x86's double-fault, triggered when
the HW can't stack the regs. All other cases are just arbitrary faults.
However, to provide that consistently, we'll need to perform this check
at every exception boundary, or some of those cases will result in a
recursive fault first.
1) In el1_sync, only check SP bounds, and live with the recursive
faults.
2) in el1_sync, check there's room for the regs, and live with the
recursive faults for overflow on other exceptions.
3) In all EL1 entry paths, check there's room for the regs.
FWIW, for the moment I've applied (2), as you suggested, to my
sub x0, x0, #S_FRAME_SIZE
... to the entry path.
I think it's worth trying (3) so that we consistently report these
cases, benchmarks permitting.
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
This is not such a crazy idea. :)
In fact, it was one I toyed with before getting lost on a register
juggling tangent (see below).
Post by Ard Biesheuvel
That way, we can compare SP (minus S_FRAME_SIZE) against a mask that
is a build time constant, to decide whether its value points into a
stack or not. Of course, it may be pointing into the wrong stack, but
that should not prevent us from taking the exception, and we can deal
with that later. It would give us a very cheap way to perform this
test on the hot paths.
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
add sp, sp, x0
... so yes, this could work!
Nice!
Post by Mark Rutland
This means that we have to align the initial task, so the kernel Image
will grow by THREAD_SIZE. Likewise for IRQ stacks, unless we can rework
things such that we can dynamically allocate all of those.
We can't currently do that for 64k pages, since the segment alignment
is only 64k. But we should be able to patch that up I think
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Mark Rutland
I believe that determining whether the exception was caused by a stack
overflow is not something we can do robustly or efficiently.
Actually, if the stack pointer is within S_FRAME_SIZE of the base, and
the faulting address points into the guard page, that is a pretty
strong indicator that the stack overflowed. That shouldn't be too
costly?
Sure, but that's still a heuristic. For example, that also catches an
unrelated vmalloc address gone wrong, while SP was close to the end of
the stack.
Yes, but the likelihood that an unrelated stray vmalloc access is
within 16 KB of a stack pointer that is close to its limit is
extremely low, so we should be able to live with the risk of
misidentifying it.
Post by Mark Rutland
The important thing is whether we can *safely enter the exception* (i.e.
stack the regs), or whether this'll push the SP (further) out-of-bounds.
I think we agree that we can reliably and efficiently check this.
Yes.
Post by Mark Rutland
The general case of nominal "stack overflows" (e.g. large preidx
decrements, proxied SP values, unrelated guard-page faults) is a
semantic minefield. I don't think we should add code to try to
distinguish these.
For that general case, if we can enter the exception then we can try to
* The fault code determines the access was bad. We at least kill the
thread.
* We overflow the stack while trying to handle the exception, triggering
a new fault to triage.
To make it possible to distinguish and debug these, we need to fix the
backtracing code, but that's it.
Thanks,
Mark.
Ard Biesheuvel
2017-07-14 12:27:14 UTC
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Mark Rutland
Post by Ard Biesheuvel
Given that the very first stp in kernel_entry will fault if we have
less than S_FRAME_SIZE bytes of stack left, I think we should check
that we have at least that much space available.
I was going to reply saying that I didn't agree, but in writing up
examples, I mostly convinced myself that this is the right thing to do.
So I mostly agree!
This would mean we treat the first impossible-to-handle exception as
that fatal case, which is similar to x86's double-fault, triggered when
the HW can't stack the regs. All other cases are just arbitrary faults.
However, to provide that consistently, we'll need to perform this check
at every exception boundary, or some of those cases will result in a
recursive fault first.
1) In el1_sync, only check SP bounds, and live with the recursive
faults.
2) in el1_sync, check there's room for the regs, and live with the
recursive faults for overflow on other exceptions.
3) In all EL1 entry paths, check there's room for the regs.
FWIW, for the moment I've applied (2), as you suggested, to my
sub x0, x0, #S_FRAME_SIZE
... to the entry path.
I think it's worth trying (3) so that we consistently report these
cases, benchmarks permitting.
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
This is not such a crazy idea. :)
In fact, it was one I toyed with before getting lost on a register
juggling tangent (see below).
Post by Ard Biesheuvel
That way, we can compare SP (minus S_FRAME_SIZE) against a mask that
is a build time constant, to decide whether its value points into a
stack or not. Of course, it may be pointing into the wrong stack, but
that should not prevent us from taking the exception, and we can deal
with that later. It would give us a very cheap way to perform this
test on the hot paths.
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
You need a neg x0, x0 here I think
Post by Ard Biesheuvel
Post by Mark Rutland
add sp, sp, x0
... so yes, this could work!
Nice!
... only, this requires a dedicated stack region, and so we'd need to
check whether sp is inside that window as well.

The easiest way would be to use a window whose start address is
power-of-two aligned, but that means the beginning of the kernel VA range
(where KASAN currently lives, and cannot be moved afaik), or a window at
the top of the linear region. Neither looks very appealing.

So that means arbitrary low and high limits to compare against in this
entry path. That means more GPRs I'm afraid.
Post by Ard Biesheuvel
Post by Mark Rutland
This means that we have to align the initial task, so the kernel Image
will grow by THREAD_SIZE. Likewise for IRQ stacks, unless we can rework
things such that we can dynamically allocate all of those.
We can't currently do that for 64k pages, since the segment alignment
is only 64k. But we should be able to patch that up I think
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Mark Rutland
I believe that determining whether the exception was caused by a stack
overflow is not something we can do robustly or efficiently.
Actually, if the stack pointer is within S_FRAME_SIZE of the base, and
the faulting address points into the guard page, that is a pretty
strong indicator that the stack overflowed. That shouldn't be too
costly?
Sure, but that's still a heuristic. For example, that also catches an
unrelated vmalloc address gone wrong, while SP was close to the end of
the stack.
Yes, but the likelihood that an unrelated stray vmalloc access is
within 16 KB of a stack pointer that is close to its limit is
extremely low, so we should be able to live with the risk of
misidentifying it.
Post by Mark Rutland
The important thing is whether we can *safely enter the exception* (i.e.
stack the regs), or whether this'll push the SP (further) out-of-bounds.
I think we agree that we can reliably and efficiently check this.
Yes.
Post by Mark Rutland
The general case of nominal "stack overflows" (e.g. large pre-index
decrements, proxied SP values, unrelated guard-page faults) is a
semantic minefield. I don't think we should add code to try to
distinguish these.
For that general case, if we can enter the exception then we can try to handle it; either:
* The fault code determines the access was bad. We at least kill the
thread.
* We overflow the stack while trying to handle the exception, triggering
a new fault to triage.
To make it possible to distinguish and debug these, we need to fix the
backtracing code, but that's it.
Thanks,
Mark.
Mark Rutland
2017-07-14 14:06:06 UTC
Permalink
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
That comment is off, and should say x0 = x0 + (orig_sp - x0) == orig_sp
Post by Ard Biesheuvel
Post by Mark Rutland
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
You need a neg x0, x0 here I think
Oh, whoops. I'd mis-simplified things.

We can avoid that by storing orig_sp + orig_x0 in sp:

add sp, sp, x0 // sp = orig_sp + orig_x0
sub x0, sp, x0 // x0 = orig_sp
< check >
sub x0, sp, x0 // x0 = orig_x0
sub sp, sp, x0 // sp = orig_sp

... which works in a locally-built kernel where I've aligned all the
stacks.
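As an aside, a minimal C model of what the tb(nz) test above checks; this is only a sketch, assuming each stack is the THREAD_SIZE region starting at a (2 * THREAD_SIZE)-aligned base with the window below it left unmapped, and the S_FRAME_SIZE value here is purely illustrative:

#include <stdbool.h>
#include <stdint.h>

#define THREAD_SHIFT	14
#define THREAD_SIZE	(1UL << THREAD_SHIFT)
#define S_FRAME_SIZE	0x130	/* illustrative stand-in for sizeof(struct pt_regs) */

/*
 * Addresses inside the stack [base, base + THREAD_SIZE) have bit
 * THREAD_SHIFT clear (base is (2 * THREAD_SIZE)-aligned); addresses in
 * the THREAD_SIZE window just below the stack have that bit set, which
 * is exactly what tb(nz) x0, #THREAD_SHIFT tests.
 */
static bool frame_would_overflow(uintptr_t sp)
{
	return (sp - S_FRAME_SIZE) & THREAD_SIZE;
}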
Post by Ard Biesheuvel
... only, this requires a dedicated stack region, and so we'd need to
check whether sp is inside that window as well.
The easiest way would be to use a window whose start address is
power-of-two aligned, but that means the beginning of the kernel VA range
(where KASAN currently lives, and cannot be moved afaik), or a window at
the top of the linear region. Neither looks very appealing.
So that means arbitrary low and high limits to compare against in this
entry path. That means more GPRs I'm afraid.
Could you elaborate on that? I'm not sure that I follow.

My understanding was that the compromise with this approach is that we
only catch overflow/underflow within THREAD_SIZE of the stack, and can
get false-negatives elsewhere. Otherwise, IIUC this is sufficient.

Are you after a more stringent check (like those from the two existing
proposals that caught all out-of-bounds accesses)?

Or am I missing something else?

Thanks,
Mark.
Ard Biesheuvel
2017-07-14 14:14:20 UTC
Permalink
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
That comment is off, and should say x0 = x0 + (orig_sp - x0) == orig_sp
Post by Ard Biesheuvel
Post by Mark Rutland
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
You need a neg x0, x0 here I think
Oh, whoops. I'd mis-simplified things.
add sp, sp, x0 // sp = orig_sp + orig_x0
sub x0, sp, x0 // x0 = orig_sp
< check >
sub x0, sp, x0 // x0 = orig_x0
sub sp, sp, x0 // sp = orig_sp
... which works in a locally-built kernel where I've aligned all the
stacks.
Yes, that looks correct to me now.
Post by Mark Rutland
Post by Ard Biesheuvel
... only, this requires a dedicated stack region, and so we'd need to
check whether sp is inside that window as well.
The easiest way would be to use a window whose start address is
power-of-two aligned, but that means the beginning of the kernel VA range
(where KASAN currently lives, and cannot be moved afaik), or a window at
the top of the linear region. Neither looks very appealing.
So that means arbitrary low and high limits to compare against in this
entry path. That means more GPRs I'm afraid.
Could you elaborate on that? I'm not sure that I follow.
My understanding was that the compromise with this approach is that we
only catch overflow/underflow within THREAD_SIZE of the stack, and can
get false-negatives elsewhere. Otherwise, IIUC this is sufficient.
Are you after a more stringent check (like those from the two existing
proposals that caught all out-of-bounds accesses)?
Or am I missing something else?
No, not at all. I managed to confuse myself into thinking that we need
to validate the value of SP in some way, i.e., as we would when
dealing with an arbitrary faulting address.
Robin Murphy
2017-07-14 14:39:39 UTC
Permalink
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
That comment is off, and should say x0 = x0 + (orig_sp - x0) == orig_sp
Post by Ard Biesheuvel
Post by Mark Rutland
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
You need a neg x0, x0 here I think
Oh, whoops. I'd mis-simplified things.
add sp, sp, x0 // sp = orig_sp + orig_x0
sub x0, sp, x0 // x0 = orig_sp
< check >
sub x0, sp, x0 // x0 = orig_x0
Haven't you now forcibly cleared the top bit of x0 thanks to overflow?

Robin.
Post by Mark Rutland
sub sp, sp, x0 // sp = orig_sp
... which works in a locally-built kernel where I've aligned all the
stacks.
Post by Ard Biesheuvel
... only, this requires a dedicated stack region, and so we'd need to
check whether sp is inside that window as well.
The easiest way would be to use a window whose start address is
power-of-two aligned, but that means the beginning of the kernel VA range
(where KASAN currently lives, and cannot be moved afaik), or a window at
the top of the linear region. Neither looks very appealing.
So that means arbitrary low and high limits to compare against in this
entry path. That means more GPRs I'm afraid.
Could you elaborate on that? I'm not sure that I follow.
My understanding was that the compromise with this approach is that we
only catch overflow/underflow within THREAD_SIZE of the stack, and can
get false-negatives elsewhere. Otherwise, IIUC this is sufficient.
Are you after a more stringent check (like those from the two existing
proposals that caught all out-of-bounds accesses)?
Or am I missing something else?
Thanks,
Mark.
Robin Murphy
2017-07-14 15:03:51 UTC
Permalink
Post by Robin Murphy
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
That comment is off, and should say x0 = x0 + (orig_sp - x0) == orig_sp
Post by Ard Biesheuvel
Post by Mark Rutland
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
You need a neg x0, x0 here I think
Oh, whoops. I'd mis-simplified things.
add sp, sp, x0 // sp = orig_sp + orig_x0
sub x0, sp, x0 // x0 = orig_sp
< check >
sub x0, sp, x0 // x0 = orig_x0
Haven't you now forcibly cleared the top bit of x0 thanks to overflow?
...or maybe not. I still can't quite see it, but I suppose it must
cancel out somewhere, since Mr. Helpful C Program[1] has apparently
proven me mistaken :(

I guess that means I approve!

Robin.

[1]:
#include <assert.h>
#include <stdint.h>

int main(void) {
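	/*
	 * y plays the role of sp and x the role of x0 in the sequence above;
	 * the uint8_t wrap-around models the 64-bit wrap-around of the GPRs.
	 */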
for (int i = 0; i < 256; i++) {
for (int j = 0; j < 256; j++) {
uint8_t x = i;
uint8_t y = j;
y = y + x;
x = y - x;
x = y - x;
y = y - x;
assert(x == i && y == j);
}
}
}
Post by Robin Murphy
Post by Mark Rutland
sub sp, sp, x0 // sp = orig_sp
... which works in a locally-built kernel where I've aligned all the
stacks.
Post by Ard Biesheuvel
... only, this requires a dedicated stack region, and so we'd need to
check whether sp is inside that window as well.
The easiest way would be to use a window whose start address is
power-of-two aligned, but that means the beginning of the kernel VA range
(where KASAN currently lives, and cannot be moved afaik), or a window at
the top of the linear region. Neither looks very appealing.
So that means arbitrary low and high limits to compare against in this
entry path. That means more GPRs I'm afraid.
Could you elaborate on that? I'm not sure that I follow.
My understanding was that the compromise with this approach is that we
only catch overflow/underflow within THREAD_SIZE of the stack, and can
get false-negatives elsewhere. Otherwise, IIUC this is sufficient.
Are you after a more stringent check (like those from the two existing
proposals that caught all out-of-bounds accesses)?
Or am I missing something else?
Thanks,
Mark.
Ard Biesheuvel
2017-07-14 15:15:37 UTC
Permalink
Post by Robin Murphy
Post by Robin Murphy
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
That comment is off, and should say x0 = x0 + (orig_sp - x0) == orig_sp
Post by Ard Biesheuvel
Post by Mark Rutland
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
You need a neg x0, x0 here I think
Oh, whoops. I'd mis-simplified things.
add sp, sp, x0 // sp = orig_sp + orig_x0
sub x0, sp, x0 // x0 = orig_sp
< check >
sub x0, sp, x0 // x0 = orig_x0
Haven't you now forcibly cleared the top bit of x0 thanks to overflow?
...or maybe not. I still can't quite see it, but I suppose it must
cancel out somewhere, since Mr. Helpful C Program[1] has apparently
proven me mistaken :(
I guess that means I approve!
Robin.
#include <assert.h>
#include <stdint.h>
int main(void) {
for (int i = 0; i < 256; i++) {
for (int j = 0; j < 256; j++) {
uint8_t x = i;
uint8_t y = j;
y = y + x;
x = y - x;
x = y - x;
y = y - x;
assert(x == i && y == j);
}
}
}
Yeah, I think the carry out in the first instruction can be ignored,
given that we don't care about the magnitude of the result, only about
the lower 64 bits. The subtraction that inverts it will be off by
exactly 2^64.
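Concretely, as an illustrative 8-bit version of the same thing: with y (sp) = 0xF0 and x (x0) = 0x20, y + x wraps to 0x10, yet 0x10 - 0x20 wraps straight back to 0xF0, so the lost carry never shows up in the recovered values.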
Mark Rutland
2017-07-14 15:25:03 UTC
Permalink
Post by Robin Murphy
Post by Robin Murphy
Post by Mark Rutland
add sp, sp, x0 // sp = orig_sp + orig_x0
sub x0, sp, x0 // x0 = orig_sp
< check >
sub x0, sp, x0 // x0 = orig_x0
Haven't you now forcibly cleared the top bit of x0 thanks to overflow?
...or maybe not. I still can't quite see it, but I suppose it must
cancel out somewhere, since Mr. Helpful C Program[1] has apparently
proven me mistaken :(
I guess that means I approve!
Robin.
#include <assert.h>
#include <stdint.h>
int main(void) {
for (int i = 0; i < 256; i++) {
for (int j = 0; j < 256; j++) {
uint8_t x = i;
uint8_t y = j;
y = y + x;
x = y - x;
x = y - x;
y = y - x;
assert(x == i && y == j);
}
}
}
I guess we have our first Tested-by for this series. :)

Thanks for taking a look!

Mark.
Mark Rutland
2017-07-14 21:27:18 UTC
Permalink
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
That comment is off, and should say x0 = x0 + (orig_sp - x0) == orig_sp
Post by Ard Biesheuvel
Post by Mark Rutland
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
You need a neg x0, x0 here I think
Oh, whoops. I'd mis-simplified things.
add sp, sp, x0 // sp = orig_sp + orig_x0
sub x0, sp, x0 // x0 = orig_sp
< check >
sub x0, sp, x0 // x0 = orig_x0
sub sp, sp, x0 // sp = orig_sp
... which works in a locally-built kernel where I've aligned all the
stacks.
FWIW, I've pushed out a somewhat cleaned-up (and slightly broken!)
version of said kernel source to my arm64/vmap-stack-align branch [1].
That's still missing the backtrace handling, IRQ stack alignment is
broken at least on 64K pages, and there's still more cleanup and rework
to do.

Thanks,
Mark.

[1] git://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git arm64/vmap-stack-align
Ard Biesheuvel
2017-07-16 00:03:05 UTC
Permalink
Post by Mark Rutland
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
That comment is off, and should say x0 = x0 + (orig_sp - x0) == orig_sp
Post by Ard Biesheuvel
Post by Mark Rutland
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
You need a neg x0, x0 here I think
Oh, whoops. I'd mis-simplified things.
add sp, sp, x0 // sp = orig_sp + orig_x0
sub x0, sp, x0 // x0 = orig_sp
< check >
sub x0, sp, x0 // x0 = orig_x0
sub sp, sp, x0 // sp = orig_sp
... which works in a locally-built kernel where I've aligned all the
stacks.
FWIW, I've pushed out a somewhat cleaned-up (and slightly broken!)
version of said kernel source to my arm64/vmap-stack-align branch [1].
That's still missing the backtrace handling, IRQ stack alignment is
broken at least on 64K pages, and there's still more cleanup and rework
to do.
I have spent some time addressing the issues mentioned in the commit
log. Please take a look.

git://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git vmap-arm64-mark
Laura Abbott
2017-07-18 21:53:40 UTC
Permalink
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
That comment is off, and should say x0 = x0 + (orig_sp - x0) == orig_sp
Post by Ard Biesheuvel
Post by Mark Rutland
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
You need a neg x0, x0 here I think
Oh, whoops. I'd mis-simplified things.
add sp, sp, x0 // sp = orig_sp + orig_x0
sub x0, sp, x0 // x0 = orig_sp
< check >
sub x0, sp, x0 // x0 = orig_x0
sub sp, sp, x0 // sp = orig_sp
... which works in a locally-built kernel where I've aligned all the
stacks.
FWIW, I've pushed out a somewhat cleaned-up (and slightly broken!)
version of said kernel source to my arm64/vmap-stack-align branch [1].
That's still missing the backtrace handling, IRQ stack alignment is
broken at least on 64K pages, and there's still more cleanup and rework
to do.
I have spent some time addressing the issues mentioned in the commit
log. Please take a look.
git://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git vmap-arm64-mark
I used vmap-arm64-mark to compile kernels for a few days. It seemed to
work well enough.

Thanks,
Laura
Ard Biesheuvel
2017-07-19 08:08:37 UTC
Permalink
Post by Laura Abbott
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
That comment is off, and should say x0 = x0 + (orig_sp - x0) == orig_sp
Post by Ard Biesheuvel
Post by Mark Rutland
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
You need a neg x0, x0 here I think
Oh, whoops. I'd mis-simplified things.
add sp, sp, x0 // sp = orig_sp + orig_x0
sub x0, sp, x0 // x0 = orig_sp
< check >
sub x0, sp, x0 // x0 = orig_x0
sub sp, sp, x0 // sp = orig_sp
... which works in a locally-built kernel where I've aligned all the
stacks.
FWIW, I've pushed out a somewhat cleaned-up (and slightly broken!)
version of said kernel source to my arm64/vmap-stack-align branch [1].
That's still missing the backtrace handling, IRQ stack alignment is
broken at least on 64K pages, and there's still more cleanup and rework
to do.
I have spent some time addressing the issues mentioned in the commit
log. Please take a look.
git://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git vmap-arm64-mark
I used vmap-arm64-mark to compile kernels for a few days. It seemed to
work well enough.
Thanks for giving this a spin. Any comments on the performance impact?
(if you happened to notice any)
Ard Biesheuvel
2017-07-20 05:35:44 UTC
Permalink
Post by Ard Biesheuvel
Post by Laura Abbott
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
OK, so here's a crazy idea: what if we
a) carve out a dedicated range in the VMALLOC area for stacks
b) for each stack, allocate a naturally aligned window of 2x the stack
size, and map the stack inside it, leaving the remaining space
unmapped
The logical ops (TST) and conditional branches (TB(N)Z, CB(N)Z) operate
on XZR rather than SP, so to do this we need to get the SP value into a
GPR.
Previously, I assumed this meant we needed to corrupt a GPR (and hence
stash that GPR in a sysreg), so I started writing code to free sysregs.
However, I now realise I was being thick, since we can stash the GPR
sub sp, sp, x0 // sp = orig_sp - x0
add x0, sp, x0 // x0 = x0 - (orig_sp - x0) == orig_sp
That comment is off, and should say x0 = x0 + (orig_sp - x0) == orig_sp
Post by Ard Biesheuvel
Post by Mark Rutland
sub x0, x0, #S_FRAME_SIZE
tb(nz) x0, #THREAD_SHIFT, overflow
add x0, x0, #S_FRAME_SIZE
sub x0, sp, x0
You need a neg x0, x0 here I think
Oh, whoops. I'd mis-simplified things.
add sp, sp, x0 // sp = orig_sp + orig_x0
sub x0, sp, x0 // x0 = orig_sp
< check >
sub x0, sp, x0 // x0 = orig_x0
sub sp, sp, x0 // sp = orig_sp
... which works in a locally-built kernel where I've aligned all the
stacks.
FWIW, I've pushed out a somewhat cleaned-up (and slightly broken!)
version of said kernel source to my arm64/vmap-stack-align branch [1].
That's still missing the backtrace handling, IRQ stack alignment is
broken at least on 64K pages, and there's still more cleanup and rework
to do.
I have spent some time addressing the issues mentioned in the commit
log. Please take a look.
git://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git vmap-arm64-mark
I used vmap-arm64-mark to compile kernels for a few days. It seemed to
work well enough.
Thanks for giving this a spin. Any comments on the performance impact?
(if you happened to notice any)
I didn't notice any performance impact but I also wasn't trying that
hard. I did try this with a different configuration and ran into
[ 0.358026] smp: Brought up 1 node, 8 CPUs
[ 0.359359] SMP: Total of 8 processors activated.
[ 0.359542] CPU features: detected feature: 32-bit EL0 Support
[ 0.361781] Insufficient stack space to handle exception!
[ 0.362075] CPU: 4 PID: 0 Comm: swapper/4 Not tainted 4.12.0-00018-ge9cf49d604ef-dirty #23
[ 0.362538] Hardware name: linux,dummy-virt (DT)
[ 0.362844] task: ffffffc03a8a3200 task.stack: ffffff8008e80000
[ 0.363389] PC is at __do_softirq+0x88/0x210
[ 0.363585] LR is at __do_softirq+0x78/0x210
[ 0.363859] pc : [<ffffff80080bfba8>] lr : [<ffffff80080bfb98>] pstate: 80000145
[ 0.364109] sp : ffffffc03bf65ea0
[ 0.364253] x29: ffffffc03bf66830 x28: 0000000000000002
[ 0.364547] x27: ffffff8008e83e20 x26: 00000000fffedb5a
[ 0.364777] x25: 0000000000000001 x24: 0000000000000000
[ 0.365017] x23: ffffff8008dc5900 x22: ffffff8008c37000
[ 0.365242] x21: 0000000000000003 x20: 0000000000000000
[ 0.365557] x19: ffffff8008d02000 x18: 0000000000040000
[ 0.365991] x17: 0000000000000000 x16: 0000000000000008
[ 0.366148] x15: ffffffc03a400228 x14: 0000000000000000
[ 0.366296] x13: ffffff8008a50b98 x12: ffffffc03a916480
[ 0.366442] x11: ffffff8008a50ba0 x10: 0000000000000008
[ 0.366624] x9 : 0000000000000004 x8 : ffffffc03bf6f630
[ 0.366779] x7 : 0000000000000020 x6 : 00000000fffedb5a
[ 0.366924] x5 : 00000000ffffffff x4 : 000000403326a000
[ 0.367071] x3 : 0000000000000101 x2 : ffffff8008ce8000
[ 0.367218] x1 : ffffff8008dc5900 x0 : 0000000000000200
[ 0.367382] Task stack: [0xffffff8008e80000..0xffffff8008e84000]
[ 0.367519] IRQ stack: [0xffffffc03bf62000..0xffffffc03bf66000]
The IRQ stack is not 16K aligned ...
[ 0.367687] ESR: 0x00000000 -- Unknown/Uncategorized
[ 0.367868] FAR: 0x0000000000000000
[ 0.368059] Kernel panic - not syncing: kernel stack overflow
[ 0.368252] CPU: 4 PID: 0 Comm: swapper/4 Not tainted 4.12.0-00018-ge9cf49d604ef-dirty #23
[ 0.368427] Hardware name: linux,dummy-virt (DT)
[ 0.368774] [<ffffff8008087fd8>] dump_backtrace+0x0/0x228
[ 0.368979] [<ffffff80080882c8>] show_stack+0x10/0x20
[ 0.369270] [<ffffff80084602dc>] dump_stack+0x88/0xac
[ 0.369459] [<ffffff800816328c>] panic+0x120/0x278
[ 0.369582] [<ffffff8008088b40>] handle_bad_stack+0xd0/0xd8
[ 0.369799] [<ffffff80080bfb94>] __do_softirq+0x74/0x210
[ 0.370560] SMP: stopping secondary CPUs
[ 0.384269] Rebooting in 5 seconds..
The config is based on what I use for booting my Hikey android
board. I haven't been able to narrow down exactly which
set of configs set this off.
... so for some reason, the percpu atom size change fails to take effect here.
James Morse
2017-07-20 08:36:18 UTC
Permalink
Hi Ard,
Post by Ard Biesheuvel
I didn't notice any performance impact but I also wasn't trying that
hard. I did try this with a different configuration and ran into
[ 0.358026] smp: Brought up 1 node, 8 CPUs
[ 0.359359] SMP: Total of 8 processors activated.
[ 0.359542] CPU features: detected feature: 32-bit EL0 Support
[ 0.361781] Insufficient stack space to handle exception!
[...]
Post by Ard Biesheuvel
[ 0.367382] Task stack: [0xffffff8008e80000..0xffffff8008e84000]
[ 0.367519] IRQ stack: [0xffffffc03bf62000..0xffffffc03bf66000]
The IRQ stack is not 16K aligned ...
[ 0.367687] ESR: 0x00000000 -- Unknown/Uncategorized
[ 0.367868] FAR: 0x0000000000000000
[ 0.368059] Kernel panic - not syncing: kernel stack overflow
[ 0.368252] CPU: 4 PID: 0 Comm: swapper/4 Not tainted 4.12.0-00018-ge9cf49d604ef-dirty #23
[ 0.368427] Hardware name: linux,dummy-virt (DT)
[ 0.368774] [<ffffff8008087fd8>] dump_backtrace+0x0/0x228
[ 0.368979] [<ffffff80080882c8>] show_stack+0x10/0x20
[ 0.369270] [<ffffff80084602dc>] dump_stack+0x88/0xac
[ 0.369459] [<ffffff800816328c>] panic+0x120/0x278
[ 0.369582] [<ffffff8008088b40>] handle_bad_stack+0xd0/0xd8
[ 0.369799] [<ffffff80080bfb94>] __do_softirq+0x74/0x210
[ 0.370560] SMP: stopping secondary CPUs
[ 0.384269] Rebooting in 5 seconds..
The config is based on what I use for booting my Hikey android
board. I haven't been able to narrow down exactly which
set of configs set this off.
... so for some reason, the percpu atom size change fails to take effect here.
I'm not completely up to speed with these series, so this may be noise:

When we added the IRQ stack Jungseok Lee discovered that alignment greater than
PAGE_SIZE only applies to CPU0. Secondary CPUs read the per-cpu init data into a
page-aligned area, but any greater alignment requirement is lost.

Because of this the irqstack was only 16-byte aligned, and struct thread_info had
to be discovered without depending on stack alignment.


Thanks,

James
Ard Biesheuvel
2017-07-20 08:56:29 UTC
Permalink
Post by James Morse
Hi Ard,
Post by Ard Biesheuvel
I didn't notice any performance impact but I also wasn't trying that
hard. I did try this with a different configuration and ran into
[ 0.358026] smp: Brought up 1 node, 8 CPUs
[ 0.359359] SMP: Total of 8 processors activated.
[ 0.359542] CPU features: detected feature: 32-bit EL0 Support
[ 0.361781] Insufficient stack space to handle exception!
[...]
Post by Ard Biesheuvel
[ 0.367382] Task stack: [0xffffff8008e80000..0xffffff8008e84000]
[ 0.367519] IRQ stack: [0xffffffc03bf62000..0xffffffc03bf66000]
The IRQ stack is not 16K aligned ...
[ 0.367687] ESR: 0x00000000 -- Unknown/Uncategorized
[ 0.367868] FAR: 0x0000000000000000
[ 0.368059] Kernel panic - not syncing: kernel stack overflow
[ 0.368252] CPU: 4 PID: 0 Comm: swapper/4 Not tainted 4.12.0-00018-ge9cf49d604ef-dirty #23
[ 0.368427] Hardware name: linux,dummy-virt (DT)
[ 0.368774] [<ffffff8008087fd8>] dump_backtrace+0x0/0x228
[ 0.368979] [<ffffff80080882c8>] show_stack+0x10/0x20
[ 0.369270] [<ffffff80084602dc>] dump_stack+0x88/0xac
[ 0.369459] [<ffffff800816328c>] panic+0x120/0x278
[ 0.369582] [<ffffff8008088b40>] handle_bad_stack+0xd0/0xd8
[ 0.369799] [<ffffff80080bfb94>] __do_softirq+0x74/0x210
[ 0.370560] SMP: stopping secondary CPUs
[ 0.384269] Rebooting in 5 seconds..
The config is based on what I use for booting my Hikey android
board. I haven't been able to narrow down exactly which
set of configs set this off.
... so for some reason, the percpu atom size change fails to take effect here.
When we added the IRQ stack Jungseok Lee discovered that alignment greater than
PAGE_SIZE only applies to CPU0. Secondary CPUs read the per-cpu init data into a
page-aligned area, but any greater alignment requirement is lost.
Because of this the irqstack was only 16-byte aligned, and struct thread_info had
to be discovered without depending on stack alignment.
We [attempted to] address that by increasing the per-CPU atom size to
THREAD_ALIGN if CONFIG_VMAP_STACK=y, but as I am typing this, I wonder
if that percolates all the way down to the actual vmap() calls. I will
investigate ...
Ard Biesheuvel
2017-07-20 17:30:10 UTC
Permalink
Post by Ard Biesheuvel
Post by James Morse
Hi Ard,
Post by Ard Biesheuvel
I didn't notice any performance impact but I also wasn't trying that
hard. I did try this with a different configuration and ran into
[ 0.358026] smp: Brought up 1 node, 8 CPUs
[ 0.359359] SMP: Total of 8 processors activated.
[ 0.359542] CPU features: detected feature: 32-bit EL0 Support
[ 0.361781] Insufficient stack space to handle exception!
[...]
Post by Ard Biesheuvel
[ 0.367382] Task stack: [0xffffff8008e80000..0xffffff8008e84000]
[ 0.367519] IRQ stack: [0xffffffc03bf62000..0xffffffc03bf66000]
The IRQ stack is not 16K aligned ...
[ 0.367687] ESR: 0x00000000 -- Unknown/Uncategorized
[ 0.367868] FAR: 0x0000000000000000
[ 0.368059] Kernel panic - not syncing: kernel stack overflow
[ 0.368252] CPU: 4 PID: 0 Comm: swapper/4 Not tainted 4.12.0-00018-ge9cf49d604ef-dirty #23
[ 0.368427] Hardware name: linux,dummy-virt (DT)
[ 0.368774] [<ffffff8008087fd8>] dump_backtrace+0x0/0x228
[ 0.368979] [<ffffff80080882c8>] show_stack+0x10/0x20
[ 0.369270] [<ffffff80084602dc>] dump_stack+0x88/0xac
[ 0.369459] [<ffffff800816328c>] panic+0x120/0x278
[ 0.369582] [<ffffff8008088b40>] handle_bad_stack+0xd0/0xd8
[ 0.369799] [<ffffff80080bfb94>] __do_softirq+0x74/0x210
[ 0.370560] SMP: stopping secondary CPUs
[ 0.384269] Rebooting in 5 seconds..
The config is based on what I use for booting my Hikey android
board. I haven't been able to narrow down exactly which
set of configs set this off.
... so for some reason, the percpu atom size change fails to take effect here.
When we added the IRQ stack Jungseok Lee discovered that alignment greater than
PAGE_SIZE only applies to CPU0. Secondary CPUs read the per-cpu init data into a
page-aligned area, but any greater alignment requirement is lost.
Because of this the irqstack was only 16byte aligned, and struct thread_info had
to be be discovered without depending on stack alignment.
We [attempted to] address that by increasing the per-CPU atom size to
THREAD_ALIGN if CONFIG_VMAP_STACK=y, but as I am typing this, I wonder
if that percolates all the way down to the actual vmap() calls. I will
investigate ...
The issue is easily reproducible in QEMU as well, when building from
the same config. I tracked it down to CONFIG_NUMA=y, which sets
CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y, affecting the placement of
the static per-CPU data (including the IRQ stack).

However, what I hadn't realised is that the first chunk is referenced
via the linear mapping, so we will need to [vm]allocate the per-CPU
IRQ stacks explicitly, and record the address in a per-CPU pointer
variable instead.

I have updated my branch accordingly.
Laura Abbott
2017-07-20 19:10:34 UTC
Permalink
Post by Ard Biesheuvel
Post by Ard Biesheuvel
Post by James Morse
Hi Ard,
Post by Ard Biesheuvel
I didn't notice any performance impact but I also wasn't trying that
hard. I did try this with a different configuration and ran into
[ 0.358026] smp: Brought up 1 node, 8 CPUs
[ 0.359359] SMP: Total of 8 processors activated.
[ 0.359542] CPU features: detected feature: 32-bit EL0 Support
[ 0.361781] Insufficient stack space to handle exception!
[...]
Post by Ard Biesheuvel
[ 0.367382] Task stack: [0xffffff8008e80000..0xffffff8008e84000]
[ 0.367519] IRQ stack: [0xffffffc03bf62000..0xffffffc03bf66000]
The IRQ stack is not 16K aligned ...
[ 0.367687] ESR: 0x00000000 -- Unknown/Uncategorized
[ 0.367868] FAR: 0x0000000000000000
[ 0.368059] Kernel panic - not syncing: kernel stack overflow
[ 0.368252] CPU: 4 PID: 0 Comm: swapper/4 Not tainted 4.12.0-00018-ge9cf49d604ef-dirty #23
[ 0.368427] Hardware name: linux,dummy-virt (DT)
[ 0.368774] [<ffffff8008087fd8>] dump_backtrace+0x0/0x228
[ 0.368979] [<ffffff80080882c8>] show_stack+0x10/0x20
[ 0.369270] [<ffffff80084602dc>] dump_stack+0x88/0xac
[ 0.369459] [<ffffff800816328c>] panic+0x120/0x278
[ 0.369582] [<ffffff8008088b40>] handle_bad_stack+0xd0/0xd8
[ 0.369799] [<ffffff80080bfb94>] __do_softirq+0x74/0x210
[ 0.370560] SMP: stopping secondary CPUs
[ 0.384269] Rebooting in 5 seconds..
The config is based on what I use for booting my Hikey android
board. I haven't been able to narrow down exactly which
set of configs set this off.
... so for some reason, the percpu atom size change fails to take effect here.
When we added the IRQ stack Jungseok Lee discovered that alignment greater than
PAGE_SIZE only applies to CPU0. Secondary CPUs read the per-cpu init data into a
page-aligned area, but any greater alignment requirement is lost.
Because of this the irqstack was only 16byte aligned, and struct thread_info had
to be be discovered without depending on stack alignment.
We [attempted to] address that by increasing the per-CPU atom size to
THREAD_ALIGN if CONFIG_VMAP_STACK=y, but as I am typing this, I wonder
if that percolates all the way down to the actual vmap() calls. I will
investigate ...
The issue is easily reproducible in QEMU as well, when building from
the same config. I tracked it down to CONFIG_NUMA=y, which sets
CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y, affecting the placement of
the static per-CPU data (including the IRQ stack).
However, what I hadn't realised is that the first chunk is referenced
via the linear mapping, so we will need to [vm]allocate the per-CPU
IRQ stacks explicitly, and record the address in a per-CPU pointer
variable instead.
I have updated my branch accordingly.
Yep, this version works, both in QEMU and booting Android.

Thanks,
Laura

Mark Rutland
2017-07-14 12:52:15 UTC
Permalink
Post by Ard Biesheuvel
Post by Mark Rutland
This means that we have to align the initial task, so the kernel Image
will grow by THREAD_SIZE. Likewise for IRQ stacks, unless we can rework
things such that we can dynamically allocate all of those.
We can't currently do that for 64k pages, since the segment alignment
is only 64k. But we should be able to patch that up I think
I was assuming that the linker would bump up the segment alignment if a
more-aligned object were placed inside. I guess that doesn't happen in
all cases?

... or do you mean when the EFI stub relocates the kernel, assuming
relaxed alignment constraints?
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
I believe that determining whether the exception was caused by a stack
overflow is not something we can do robustly or efficiently.
Actually, if the stack pointer is within S_FRAME_SIZE of the base, and
the faulting address points into the guard page, that is a pretty
strong indicator that the stack overflowed. That shouldn't be too
costly?
Sure, but that's still a heuristic. For example, that also catches an
unrelated vmalloc address gone wrong, while SP was close to the end of
the stack.
Yes, but the likelihood that an unrelated stray vmalloc access is
within 16 KB of a stack pointer that is close to its limit is
extremely low, so we should be able to live with the risk of
misidentifying it.
I guess, but at that point, why bother?

That gives us a fuzzy check for one specific "stack overflow", while not
catching the general case.

So long as we have a reliable stack trace, we can figure out that that was
the case, and we don't set the expectation that we're trying to
categorize the general case (minefield and all).

Thanks,
Mark.
Ard Biesheuvel
2017-07-14 12:55:31 UTC
Permalink
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
This means that we have to align the initial task, so the kernel Image
will grow by THREAD_SIZE. Likewise for IRQ stacks, unless we can rework
things such that we can dynamically allocate all of those.
We can't currently do that for 64k pages, since the segment alignment
is only 64k. But we should be able to patch that up I think
I was assuming that the linker would bump up the segment alignment if a
more-aligned object were placed inside. I guess that doesn't happen in
all cases?
... or do you mean when the EFI stub relocates the kernel, assuming
relaxed alignment constraints?
No, I mean under KASLR, which randomizes at SEGMENT_ALIGN granularity.
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
Post by Ard Biesheuvel
Post by Mark Rutland
I believe that determining whether the exception was caused by a stack
overflow is not something we can do robustly or efficiently.
Actually, if the stack pointer is within S_FRAME_SIZE of the base, and
the faulting address points into the guard page, that is a pretty
strong indicator that the stack overflowed. That shouldn't be too
costly?
Sure, but that's still a heuristic. For example, that also catches an
unrelated vmalloc address gone wrong, while SP was close to the end of
the stack.
Yes, but the likelihood that an unrelated stray vmalloc access is
within 16 KB of a stack pointer that is close to its limit is
extremely low, so we should be able to live with the risk of
misidentifying it.
I guess, but at that point, why bother?
That gives us a fuzzy check for one specific "stack overflow", while not
catching the general case.
So long as we have a reliable stack trace, we can figure out that was
the case, and we don't set the expectation that we're trying to
categorize the general case (minefield and all).
Yes. As long as the context is described accurately, there is no need
to make any inferences on behalf of the user.
Mark Rutland
2017-07-12 22:33:02 UTC
Permalink
To reliably check stack bounds, we'll need to know whether we're on a
task stack, or an IRQ stack.

Stash the base of the current stack in thread_info so that we have this
information.

Signed-off-by: Mark Rutland <***@arm.com>
---
arch/arm64/include/asm/thread_info.h | 3 +++
arch/arm64/kernel/asm-offsets.c | 3 +++
arch/arm64/kernel/entry.S | 7 +++++++
arch/arm64/kernel/head.S | 6 ++++++
arch/arm64/kernel/process.c | 4 ++++
5 files changed, 23 insertions(+)

diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 3684f86..ae4f44b 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -62,6 +62,9 @@ struct thread_info {
#endif
unsigned long pcp_offset;
int preempt_count; /* 0 => preemptable, <0 => bug */
+#ifdef CONFIG_VMAP_STACK
+ unsigned long current_stack;
+#endif
};

#define INIT_THREAD_INFO(tsk) \
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 17001be..10c8ffa 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -40,6 +40,9 @@ int main(void)
DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
DEFINE(TSK_TI_PCP, offsetof(struct task_struct, thread_info.pcp_offset));
DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit));
+#ifdef CONFIG_VMAP_STACK
+ DEFINE(TSK_TI_CUR_STK, offsetof(struct task_struct, thread_info.current_stack));
+#endif
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
#endif
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 773b3fea..7c8b164 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -258,6 +258,9 @@ alternative_else_nop_endif

/* switch to the irq stack */
mov sp, x26
+#ifdef CONFIG_VMAP_STACK
+ str x25, [tsk, #TSK_TI_CUR_STK]
+#endif

/*
* Add a dummy stack frame, this non-standard format is fixed up
@@ -275,6 +278,10 @@ alternative_else_nop_endif
*/
.macro irq_stack_exit
mov sp, x19
+#ifdef CONFIG_VMAP_STACK
+ and x19, x19, #~(THREAD_SIZE - 1)
+ str x19, [tsk, #TSK_TI_CUR_STK]
+#endif
.endm

/*
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index db77cac..3363846 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -325,6 +325,9 @@ __primary_switched:
add sp, x4, #THREAD_SIZE
adr_l x5, init_task
msr tpidr_el1, x5 // Save thread_info
+#ifdef CONFIG_VMAP_STACK
+ str x4, [x5, #TSK_TI_CUR_STK]
+#endif

adr_l x8, vectors // load VBAR_EL1 with virtual
msr vbar_el1, x8 // vector table address
@@ -616,6 +619,9 @@ __secondary_switched:
mov x3, #THREAD_START_SP
add sp, x1, x3
ldr x2, [x0, #CPU_BOOT_TASK]
+#ifdef CONFIG_VMAP_STACK
+ str x1, [x2, #TSK_TI_CUR_STK]
+#endif
msr tpidr_el1, x2
mov x29, #0
b secondary_start_kernel
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 4212da3..5dc5797 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -294,6 +294,10 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,

ptrace_hw_copy_thread(p);

+#ifdef CONFIG_VMAP_STACK
+ p->thread_info.current_stack = (unsigned long)p->stack;
+#endif
+
return 0;
}
--
1.9.1
James Morse
2017-07-13 10:18:35 UTC
Permalink
Hi Mark,
Post by Mark Rutland
Currently we define THREAD_SIZE_ORDER dependent on which arm64-specific
page size kconfig symbol was selected. This is unfortunate, as it hides
the relationship between THREAD_SIZE_ORDER and THREAD_SIZE, and makes it
painful more painful than necessary to modify the thread size as we will
need to do for some debug configurations.
This patch follows arch/metag's approach of consistently defining
THREAD_SIZE in terms of THREAD_SIZE_ORDER. This avoids having ifdefs for
particular page size configurations, and allows us to change a single
definition to change the thread size.
I think this has unintended side effects for 64K page systems. (or at least not
yet intended)
Post by Mark Rutland
#ifdef CONFIG_ARM64_4K_PAGES
#define THREAD_SIZE_ORDER 2
#elif defined(CONFIG_ARM64_16K_PAGES)
#define THREAD_SIZE_ORDER 0
#endif
#define THREAD_SIZE 16384
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
[...]
Post by Mark Rutland
#else
[...]
Post by Mark Rutland
void thread_stack_cache_init(void)
{
thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
THREAD_SIZE, 0, NULL);
BUG_ON(thread_stack_cache == NULL);
}
#endif
With 16K stacks on 64K pages we take the #else path above, creating a kmemcache to share 64K pages as 16K stacks.
Post by Mark Rutland
#define THREAD_SHIFT 14
#if THREAD_SHIFT >= PAGE_SHIFT
#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT)
#else
#define THREAD_SIZE_ORDER 0
#endif
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
gives us a 64K THREAD_SIZE: with 64K pages, THREAD_SHIFT (14) is below PAGE_SHIFT (16), so THREAD_SIZE_ORDER is 0 and THREAD_SIZE becomes PAGE_SIZE << 0 = 64K.



Thanks,

James
Post by Mark Rutland
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 141f13e9..6d0c59a 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -23,13 +23,17 @@
#include <linux/compiler.h>
-#ifdef CONFIG_ARM64_4K_PAGES
-#define THREAD_SIZE_ORDER 2
-#elif defined(CONFIG_ARM64_16K_PAGES)
+#include <asm/page.h>
+
+#define THREAD_SHIFT 14
+
+#if THREAD_SHIFT >= PAGE_SHIFT
+#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT)
+#else
#define THREAD_SIZE_ORDER 0
#endif
-#define THREAD_SIZE 16384
+#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#define THREAD_START_SP (THREAD_SIZE - 16)
#ifndef __ASSEMBLY__
Mark Rutland
2017-07-13 11:26:40 UTC
Permalink
Post by Ard Biesheuvel
Hi Mark,
Post by Mark Rutland
Currently we define THREAD_SIZE_ORDER dependent on which arm64-specific
page size kconfig symbol was selected. This is unfortunate, as it hides
the relationship between THREAD_SIZE_ORDER and THREAD_SIZE, and makes it
painful more painful than necessary to modify the thread size as we will
need to do for some debug configurations.
This patch follows arch/metag's approach of consistently defining
THREAD_SIZE in terms of THREAD_SIZE_ORDER. This avoids having ifdefs for
particular page size configurations, and allows us to change a single
definition to change the thread size.
I think this has unintended side effects for 64K page systems. (or at least not
yet intended)
Post by Mark Rutland
#ifdef CONFIG_ARM64_4K_PAGES
#define THREAD_SIZE_ORDER 2
#elif defined(CONFIG_ARM64_16K_PAGES)
#define THREAD_SIZE_ORDER 0
#endif
#define THREAD_SIZE 16384
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
[...]
Post by Mark Rutland
#else
[...]
Post by Mark Rutland
void thread_stack_cache_init(void)
{
thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
THREAD_SIZE, 0, NULL);
BUG_ON(thread_stack_cache == NULL);
}
#endif
To create a kmemcache to share 64K pages as 16K stacks.
Post by Mark Rutland
#define THREAD_SHIFT 14
#if THREAD_SHIFT >= PAGE_SHIFT
#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT)
#else
#define THREAD_SIZE_ORDER 0
#endif
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
gives us a 64K THREAD_SIZE.
Yes; I'd gotten confused as to what I was doing here. Thanks for
spotting that.

I've folded this and the next patch, with the resultant logic being as
below, which I think fixes this.

Thanks,
Mark.

---->8----
#define MIN_THREAD_SHIFT 14

/*
* Each VMAP stack is a separate VMALLOC allocation, which is at least
* PAGE_SIZE.
*/
#if defined(CONFIG_VMAP_STACK) && (MIN_THREAD_SHIFT < PAGE_SHIFT)
#define THREAD_SHIFT PAGE_SHIFT
#else
#define THREAD_SHIFT MIN_THREAD_SHIFT
#endif

#if THREAD_SHIFT >= PAGE_SHIFT
#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT)
#endif

#define THREAD_SIZE (1UL << THREAD_SHIFT)
#define THREAD_START_SP (THREAD_SIZE - 16)
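As a worked example (a sketch, assuming the usual arm64 PAGE_SHIFT values of 12/14/16 for 4K/16K/64K pages), this resolves to:

4K pages:                THREAD_SHIFT = 14, THREAD_SIZE = 16K, THREAD_SIZE_ORDER = 2
16K pages:               THREAD_SHIFT = 14, THREAD_SIZE = 16K, THREAD_SIZE_ORDER = 0
64K pages, !VMAP_STACK:  THREAD_SHIFT = 14, THREAD_SIZE = 16K, THREAD_SIZE_ORDER undefined
                         (stacks keep coming from the fork.c kmemcache)
64K pages, VMAP_STACK:   THREAD_SHIFT = 16, THREAD_SIZE = 64K, THREAD_SIZE_ORDER = 0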