Discussion:
[PATCH v10 1/3] x86/syscalls: Check address limit on user-mode return
Thomas Garnier
2017-06-15 01:12:01 UTC
Permalink
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
privileges [1].

The set_fs function sets the TIF_FSCHECK flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.

The addr_limit_user_check function is added as a cross-architecture
function to check the address limit.

[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990

Signed-off-by: Thomas Garnier <***@google.com>
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.

Based on next-20170609
---
arch/x86/entry/common.c | 3 +++
arch/x86/include/asm/thread_info.h | 5 ++++-
arch/x86/include/asm/uaccess.h | 7 ++++++-
include/linux/syscalls.h | 16 ++++++++++++++++
4 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index cdefcfdd9e63..03505ffbe1b6 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -23,6 +23,7 @@
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
+#include <linux/syscalls.h>

#include <asm/desc.h>
#include <asm/traps.h>
@@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
struct thread_info *ti = current_thread_info();
u32 cached_flags;

+ addr_limit_user_check();
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
local_irq_disable();

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e00e1bd6e7b3..5161da1a0fa0 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -98,6 +98,7 @@ struct thread_info {
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
#define TIF_X32 30 /* 32-bit native x86-64 binary */
+#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */

#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -122,6 +123,7 @@ struct thread_info {
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)

/*
* work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
@@ -137,7 +139,8 @@ struct thread_info {
(_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \
_TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \
_TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \
- _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT)
+ _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \
+ _TIF_FSCHECK)

/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a059aac9e937..11433f9018e2 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -26,7 +26,12 @@

#define get_ds() (KERNEL_DS)
#define get_fs() (current->thread.addr_limit)
-#define set_fs(x) (current->thread.addr_limit = (x))
+static inline void set_fs(mm_segment_t fs)
+{
+ current->thread.addr_limit = fs;
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
+}

#define segment_eq(a, b) ((a).seg == (b).seg)

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..ac0cf6fb25d6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -206,6 +206,22 @@ extern struct trace_event_functions exit_syscall_print_funcs;
} \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

+#ifdef TIF_FSCHECK
+/*
+ * Called before coming back to user-mode. Returning to user-mode with an
+ * address limit different than USER_DS can allow to overwrite kernel memory.
+ */
+static inline void addr_limit_user_check(void)
+{
+
+ if (!test_thread_flag(TIF_FSCHECK))
+ return;
+
+ BUG_ON(!segment_eq(get_fs(), USER_DS));
+ clear_thread_flag(TIF_FSCHECK);
+}
+#endif
+
asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
qid_t id, void __user *addr);
asmlinkage long sys_time(time_t __user *tloc);
--
2.13.1.518.g3df882009-goog
Thomas Garnier
2017-06-15 01:12:02 UTC
Permalink
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].

The set_fs function sets the TIF_FSCHECK flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.

The TIF_FSCHECK flag is added to _TIF_WORK_MASK, shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to be used in a single instruction, so adapt ret_fast_syscall.

[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990

Signed-off-by: Thomas Garnier <***@google.com>
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.

Based on next-20170609
---
arch/arm/include/asm/thread_info.h | 15 +++++++++------
arch/arm/include/asm/uaccess.h | 2 ++
arch/arm/kernel/entry-common.S | 9 +++++++--
arch/arm/kernel/signal.c | 5 +++++
4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 776757d1604a..1d468b527b7b 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
#define TIF_NEED_RESCHED 1 /* rescheduling necessary */
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
#define TIF_UPROBE 3 /* breakpointed or singlestepping */
-#define TIF_SYSCALL_TRACE 4 /* syscall trace active */
-#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
-#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
+#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */
+#define TIF_SYSCALL_TRACE 5 /* syscall trace active */
+#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */
+#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */
+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */

#define TIF_NOHZ 12 /* in adaptive nohz mode */
#define TIF_USING_IWMMXT 17
@@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_UPROBE (1 << TIF_UPROBE)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
@@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
/*
* Change these and you break ASM code in entry-common.S
*/
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
- _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_FSCHECK)

#endif /* __KERNEL__ */
#endif /* __ASM_ARM_THREAD_INFO_H */
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 2577405d082d..6cc882223e34 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -77,6 +77,8 @@ static inline void set_fs(mm_segment_t fs)
{
current_thread_info()->addr_limit = fs;
modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
}

#define segment_eq(a, b) ((a) == (b))
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..e33c32d56193 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -41,7 +41,9 @@ ret_fast_syscall:
UNWIND(.cantunwind )
disable_irq_notrace @ disable interrupts
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
bne fast_work_pending

/* perform architecture specific actions before user return */
@@ -67,12 +69,15 @@ ret_fast_syscall:
str r0, [sp, #S_R0 + S_OFF]! @ save returned r0
disable_irq_notrace @ disable interrupts
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
beq no_work_pending
UNWIND(.fnend )
ENDPROC(ret_fast_syscall)

/* Slower path - fall through to work_pending */
+fast_work_pending:
#endif

tst r1, #_TIF_SYSCALL_WORK
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 7b8f2141427b..3a48b54c6405 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -14,6 +14,7 @@
#include <linux/uaccess.h>
#include <linux/tracehook.h>
#include <linux/uprobes.h>
+#include <linux/syscalls.h>

#include <asm/elf.h>
#include <asm/cacheflush.h>
@@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
* Update the trace code with the current status.
*/
trace_hardirqs_off();
+
+ /* Check valid user FS if needed */
+ addr_limit_user_check();
+
do {
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
--
2.13.1.518.g3df882009-goog
Kees Cook
2017-06-20 20:18:02 UTC
Permalink
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to used on a single instruction so adapt ret_fast_syscall.
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.
Based on next-20170609
---
arch/arm/include/asm/thread_info.h | 15 +++++++++------
arch/arm/include/asm/uaccess.h | 2 ++
arch/arm/kernel/entry-common.S | 9 +++++++--
arch/arm/kernel/signal.c | 5 +++++
4 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 776757d1604a..1d468b527b7b 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
#define TIF_NEED_RESCHED 1 /* rescheduling necessary */
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
#define TIF_UPROBE 3 /* breakpointed or singlestepping */
-#define TIF_SYSCALL_TRACE 4 /* syscall trace active */
-#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
-#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
+#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */
+#define TIF_SYSCALL_TRACE 5 /* syscall trace active */
+#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */
+#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */
+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
#define TIF_NOHZ 12 /* in adaptive nohz mode */
#define TIF_USING_IWMMXT 17
@@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_UPROBE (1 << TIF_UPROBE)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
@@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
/*
* Change these and you break ASM code in entry-common.S
*/
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
- _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_FSCHECK)
#endif /* __KERNEL__ */
#endif /* __ASM_ARM_THREAD_INFO_H */
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 2577405d082d..6cc882223e34 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -77,6 +77,8 @@ static inline void set_fs(mm_segment_t fs)
{
current_thread_info()->addr_limit = fs;
modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
}
#define segment_eq(a, b) ((a) == (b))
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..e33c32d56193 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
UNWIND(.cantunwind )
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
(IIUC) MOV32 is 2 cycles (MOVW, MOVT), and each TST above is 1 cycle
and each BNE is 1 cycle (when not taken). So:

mov32 r2, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
tst r1, r2
bne fast_work_pending

is 4 cycles and tst, bne, tst, bne is also 4 cycles. Would mov32 be
more readable (since it keeps the flags together)?

-Kees
--
Kees Cook
Pixel Security
Thomas Garnier
2017-06-20 20:31:14 UTC
Permalink
Post by Kees Cook
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to used on a single instruction so adapt ret_fast_syscall.
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.
Based on next-20170609
---
arch/arm/include/asm/thread_info.h | 15 +++++++++------
arch/arm/include/asm/uaccess.h | 2 ++
arch/arm/kernel/entry-common.S | 9 +++++++--
arch/arm/kernel/signal.c | 5 +++++
4 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 776757d1604a..1d468b527b7b 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
#define TIF_NEED_RESCHED 1 /* rescheduling necessary */
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
#define TIF_UPROBE 3 /* breakpointed or singlestepping */
-#define TIF_SYSCALL_TRACE 4 /* syscall trace active */
-#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
-#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
+#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */
+#define TIF_SYSCALL_TRACE 5 /* syscall trace active */
+#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */
+#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */
+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
#define TIF_NOHZ 12 /* in adaptive nohz mode */
#define TIF_USING_IWMMXT 17
@@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_UPROBE (1 << TIF_UPROBE)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
@@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
/*
* Change these and you break ASM code in entry-common.S
*/
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
- _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_FSCHECK)
#endif /* __KERNEL__ */
#endif /* __ASM_ARM_THREAD_INFO_H */
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 2577405d082d..6cc882223e34 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -77,6 +77,8 @@ static inline void set_fs(mm_segment_t fs)
{
current_thread_info()->addr_limit = fs;
modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
}
#define segment_eq(a, b) ((a) == (b))
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..e33c32d56193 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
UNWIND(.cantunwind )
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
(IIUC) MOV32 is 2 cycles (MOVW, MOVT), and each TST above is 1 cycle
mov32 r2, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
tst r1, r2
bne fast_work_pending
is 4 cycles and tst, bne, tst, bne is also 4 cycles. Would mov32 be
more readable (since it keeps the flags together)?
I guess it would be more readable. Any opinion from the arm folks?
Post by Kees Cook
-Kees
--
Kees Cook
Pixel Security
--
Thomas
Will Deacon
2017-06-21 09:08:15 UTC
Permalink
Post by Thomas Garnier
Post by Kees Cook
Post by Thomas Garnier
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..e33c32d56193 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
UNWIND(.cantunwind )
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
(IIUC) MOV32 is 2 cycles (MOVW, MOVT), and each TST above is 1 cycle
mov32 r2, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
tst r1, r2
bne fast_work_pending
is 4 cycles and tst, bne, tst, bne is also 4 cycles. Would mov32 be
more readable (since it keeps the flags together)?
I guess it would be more readable. Any opinion from the arm folks?
The mov32 sequence is probably better, but statically attributing cycles
on a per instruction basis is pretty futile on modern CPUs.

Will
Leonard Crestez
2017-07-18 14:36:06 UTC
Permalink
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to used on a single instruction so adapt ret_fast_syscall.
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.
Based on next-20170609
---
 arch/arm/include/asm/thread_info.h | 15 +++++++++------
 arch/arm/include/asm/uaccess.h     |  2 ++
 arch/arm/kernel/entry-common.S     |  9 +++++++--
 arch/arm/kernel/signal.c           |  5 +++++
 4 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 776757d1604a..1d468b527b7b 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 #define TIF_NEED_RESCHED 1 /* rescheduling necessary */
 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
 #define TIF_UPROBE 3 /* breakpointed or singlestepping */
-#define TIF_SYSCALL_TRACE 4 /* syscall trace active */
-#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
-#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
+#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */
+#define TIF_SYSCALL_TRACE 5 /* syscall trace active */
+#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */
+#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */
+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
 
 #define TIF_NOHZ 12 /* in adaptive nohz mode */
 #define TIF_USING_IWMMXT 17
@@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
 #define _TIF_UPROBE (1 << TIF_UPROBE)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
@@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 /*
  * Change these and you break ASM code in entry-common.S
  */
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
-  _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+  _TIF_FSCHECK)
 
 #endif /* __KERNEL__ */
 #endif /* __ASM_ARM_THREAD_INFO_H */
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 2577405d082d..6cc882223e34 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -77,6 +77,8 @@ static inline void set_fs(mm_segment_t fs)
 {
  current_thread_info()->addr_limit = fs;
  modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
 }
 
 #define segment_eq(a, b) ((a) == (b))
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..e33c32d56193 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
  UNWIND(.cantunwind )
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
  bne fast_work_pending
 
  /* perform architecture specific actions before user return */
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
  beq no_work_pending
  UNWIND(.fnend )
 ENDPROC(ret_fast_syscall)
 
  /* Slower path - fall through to work_pending */
 #endif
 
  tst r1, #_TIF_SYSCALL_WORK
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 7b8f2141427b..3a48b54c6405 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -14,6 +14,7 @@
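 #include <linux/uaccess.h>
 #include <linux/tracehook.h>
 #include <linux/uprobes.h>
+#include <linux/syscalls.h>
 
 #include <asm/elf.h>
 #include <asm/cacheflush.h>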
@@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
   * Update the trace code with the current status.
   */
  trace_hardirqs_off();
+
+ /* Check valid user FS if needed */
+ addr_limit_user_check();
+
  do {
  if (likely(thread_flags & _TIF_NEED_RESCHED)) {
  schedule();
This patch made its way into linux-next next-20170717 and it seems to
cause hangs when booting some boards over NFS (found via bisection). I
don't know exactly what determines the issue but I can reproduce hangs
even if I just boot with init=/bin/bash and do stuff like

# sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!

When this happens sysrq-t shows a sleep task hung in the 'R' state
spinning in do_work_pending, so maybe there is a potential infinite
loop here?

The addr_limit_user_check at the start of do_work_pending will check
for TIF_FSCHECK once and clear it but the function loops while
(thread_flags & _TIF_WORK_MASK), so if TIF_FSCHECK is set again then
the loop will never terminate. Does this make sense?

I added some instrumentation to check if TIF_FSCHECK can show up during
the do_work_pending loop and the answer seems to be yes. I also tried
to get a stack with a set_fs call from inside do_work_pending and got
the following:

[  227.582402] CPU: 0 PID: 829 Comm: sleep Not tainted 4.12.0-01057-g93af8f7-dirty #332
[  227.590171] Hardware name: Freescale i.MX6 SoloLite (Device Tree)
[  227.596275] Backtrace: 
[  227.598754] [<c010cbb4>] (dump_backtrace) from [<c010ce60>] (show_stack+0x18/0x1c)
[  227.606339]  r7:00000000 r6:60070113 r5:00000000 r4:c105a958
[  227.612016] [<c010ce48>] (show_stack) from [<c0493498>] (dump_stack+0xb4/0xe8)
[  227.619258] [<c04933e4>] (dump_stack) from [<c010c350>] (mydbg_set_fs+0x40/0x48)
[  227.626671]  r9:c08cf35c r8:ee1cda7c r7:ee1e3dce r6:bf000000 r5:00000000 r4:ffffe000
[  227.634433] [<c010c310>] (mydbg_set_fs) from [<c021f0b8>] (__probe_kernel_read+0x44/0xd0)
[  227.642629] [<c021f074>] (__probe_kernel_read) from [<c011b8d8>] (do_alignment+0x8c/0x75c)
[  227.650909]  r10:ef085000 r9:c08cf35c r8:00000001 r7:ee1e3dce r6:c011b84c r5:ee1cdbe0
[  227.658748]  r4:00000000 r3:00000000
[  227.662338] [<c011b84c>] (do_alignment) from [<c0101394>] (do_DataAbort+0x40/0xc0)
[  227.669921]  r10:ef085000 r9:ee1cc000 r8:ee1cdbe0 r7:ee1e3dce r6:c011b84c r5:00000001
[  227.677760]  r4:c100dd3c
[  227.680308] [<c0101354>] (do_DataAbort) from [<c010da44>] (__dabt_svc+0x64/0xa0)
[  227.687714] Exception stack(0xee1cdbe0 to 0xee1cdc28)
[  227.692780] dbe0: 9064a8c0 ee1e3de2 d82727d8 00000000 ee1b20c0 ee1e3dce 00000000 ef08572c
[  227.700971] dc00: c0bb2034 c10c75ea ef085000 ee1cdc74 ee1cdc00 ee1cdc30 c01761a8 c08cf35c
[  227.709158] dc20: 40070113 ffffffff
[  227.712661]  r8:c0bb2034 r7:ee1cdc14 r6:ffffffff r5:40070113 r4:c08cf35c
[  227.719382] [<c08cf16c>] (inet_gro_receive) from [<c084a8ec>] (dev_gro_receive+0x2f0/0x618)
[  227.727746]  r10:ef085000 r9:00000001 r8:00000000 r7:ef085710 r6:c1008b88 r5:ee1b20c0
[  227.735585]  r4:c1009f78
[  227.738132] [<c084a5fc>] (dev_gro_receive) from [<c084ac8c>] (napi_gro_receive+0x78/0x1f4)
[  227.746410]  r10:ef085000 r9:00000001 r8:c10d15ec r7:c100792c r6:ef085710 r5:c10c744e
[  227.754249]  r4:ee1b20c0
[  227.756801] [<c084ac14>] (napi_gro_receive) from [<c06a2784>] (fec_enet_rx_napi+0x39c/0x988)
[  227.765253]  r9:00000001 r8:f0c8a960 r7:00000000 r6:00000000 r5:ef086000 r4:ee1b20c0
[  227.773010] [<c06a23e8>] (fec_enet_rx_napi) from [<c084a3a4>] (net_rx_action+0x21c/0x474)
[  227.781201]  r10:ee1cdd78 r9:c0fa7b80 r8:ef7dab80 r7:0000012c r6:00000040 r5:00000001
[  227.789039]  r4:ef085710
[  227.791593] [<c084a188>] (net_rx_action) from [<c012f2d4>] (__do_softirq+0x158/0x534)
[  227.799437]  r10:00000008 r9:ee1cc000 r8:c10ce568 r7:c100792c r6:c10247bd r5:00000003
[  227.807275]  r4:c100208c
[  227.809824] [<c012f17c>] (__do_softirq) from [<c012fa68>] (irq_exit+0xec/0x168)
[  227.817147]  r10:c1007ea0 r9:ef010400 r8:00000001 r7:00000000 r6:c1007d3c r5:00000000
[  227.824984]  r4:c0fa534c
[  227.827534] [<c012f97c>] (irq_exit) from [<c01883f4>] (__handle_domain_irq+0x74/0xe8)
[  227.835377] [<c0188380>] (__handle_domain_irq) from [<c01015fc>] (gic_handle_irq+0x58/0xbc)
[  227.843742]  r9:f080b100 r8:c105ae80 r7:ee1cde80 r6:000003ff r5:000003eb r4:f080b10c
[  227.851498] [<c01015a4>] (gic_handle_irq) from [<c010daf0>] (__irq_svc+0x70/0x98)
[  227.858990] Exception stack(0xee1cde80 to 0xee1cdec8)
[  227.864056] de80: ee7a1140 00000001 00000000 000012a9 ee7a1140 ee9d9f10 ee76edc0 ee9d9f60
[  227.872248] dea0: 00000000 ee9d9f10 00000010 ee1cdeec ee1cdeb8 ee1cded0 c038a77c c0389688
[  227.880434] dec0: 60070013 ffffffff
[  227.883937]  r10:00000010 r9:ee1cc000 r8:00000000 r7:ee1cdeb4 r6:ffffffff r5:60070013
[  227.891775]  r4:c0389688
[  227.894327] [<c038a6f8>] (nfs_file_clear_open_context) from [<c03860e8>] (nfs_file_release+0x54/0x60)
[  227.903558]  r7:ee9a78a0 r6:ee68f010 r5:ee9d9f10 r4:ee76edc0
[  227.909235] [<c0386094>] (nfs_file_release) from [<c0276cb4>] (__fput+0x94/0x1e0)
[  227.916734] [<c0276c20>] (__fput) from [<c0276e60>] (____fput+0x10/0x14)
[  227.923448]  r10:c10d4298 r9:00000000 r8:00000000 r7:ef2ed780 r6:ef2edc00 r5:c10d5180
[  227.931286]  r4:ef2edbd4
[  227.933839] [<c0276e50>] (____fput) from [<c014c534>] (task_work_run+0xc8/0xec)
[  227.941166] [<c014c46c>] (task_work_run) from [<c010c484>] (do_work_pending+0x12c/0x1c4)
[  227.949271]  r9:ee1cdfb0 r8:00000000 r7:00000000 r6:ee1cc000 r5:00000000 r4:00000000
[  227.957029] [<c010c358>] (do_work_pending) from [<c0107c90>] (slow_work_pending+0xc/0x20)
[  227.965219]  r10:00000000 r9:ee1cc000 r8:c0107e24 r7:0000005b r6:b6f76568 r5:b6f741f0
[  227.973058]  r4:b6f76904

Maybe the reason this reproduces easily in this particular setup is
that ethernet causes lots of alignment faults?

--
Regards,
Leonard
Thomas Garnier
2017-07-18 16:04:52 UTC
Permalink
On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez
Post by Leonard Crestez
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to used on a single instruction so adapt ret_fast_syscall.
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.
Based on next-20170609
---
arch/arm/include/asm/thread_info.h | 15 +++++++++------
arch/arm/include/asm/uaccess.h | 2 ++
arch/arm/kernel/entry-common.S | 9 +++++++--
arch/arm/kernel/signal.c | 5 +++++
4 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 776757d1604a..1d468b527b7b 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
#define TIF_NEED_RESCHED 1 /* rescheduling necessary */
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
#define TIF_UPROBE 3 /* breakpointed or singlestepping */
-#define TIF_SYSCALL_TRACE 4 /* syscall trace active */
-#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
-#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
+#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */
+#define TIF_SYSCALL_TRACE 5 /* syscall trace active */
+#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */
+#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */
+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
#define TIF_NOHZ 12 /* in adaptive nohz mode */
#define TIF_USING_IWMMXT 17
@@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_UPROBE (1 << TIF_UPROBE)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
@@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
/*
* Change these and you break ASM code in entry-common.S
*/
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
- _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_FSCHECK)
#endif /* __KERNEL__ */
#endif /* __ASM_ARM_THREAD_INFO_H */
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 2577405d082d..6cc882223e34 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -77,6 +77,8 @@ static inline void set_fs(mm_segment_t fs)
{
current_thread_info()->addr_limit = fs;
modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
}
#define segment_eq(a, b) ((a) == (b))
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..e33c32d56193 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
UNWIND(.cantunwind )
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
bne fast_work_pending
/* perform architecture specific actions before user return */
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
beq no_work_pending
UNWIND(.fnend )
ENDPROC(ret_fast_syscall)
/* Slower path - fall through to work_pending */
#endif
tst r1, #_TIF_SYSCALL_WORK
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 7b8f2141427b..3a48b54c6405 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -14,6 +14,7 @@
#include
#include
#include
+#include <linux/syscalls.h>
#include
#include
@@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
* Update the trace code with the current status.
*/
trace_hardirqs_off();
+
+ /* Check valid user FS if needed */
+ addr_limit_user_check();
+
do {
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
This patch made its way into linux-next next-20170717 and it seems to
cause hangs when booting some boards over NFS (found via bisection). I
don't know exactly what determines the issue but I can reproduce hangs
even if I just boot with init=/bin/bash and do stuff like
# sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
When this happens sysrq-t shows a sleep task hung in the 'R' state
spinning in do_work_pending, so maybe there is a potential infinite
loop here?
The addr_limit_user_check at the start of do_work_pending will check
for TIF_FSCHECK once and clear it, but the function loops while
(thread_flags & _TIF_WORK_MASK), so if TIF_FSCHECK is set again then
the loop will never terminate. Does this make sense?
Yes, it does. Thanks for looking into this.
Post by Leonard Crestez
I added some instrumentation to check if TIF_FSCHECK can show up during
the do_work_pending loop and the answer seems to be yes. I also tried
to get a stack with a set_fs call from inside do_work_pending and got
[ 227.582402] CPU: 0 PID: 829 Comm: sleep Not tainted 4.12.0-01057-g93af8f7-dirty #332
[ 227.590171] Hardware name: Freescale i.MX6 SoloLite (Device Tree)
[ 227.598754] [<c010cbb4>] (dump_backtrace) from [<c010ce60>] (show_stack+0x18/0x1c)
[ 227.606339] r7:00000000 r6:60070113 r5:00000000 r4:c105a958
[ 227.612016] [<c010ce48>] (show_stack) from [<c0493498>] (dump_stack+0xb4/0xe8)
[ 227.619258] [<c04933e4>] (dump_stack) from [<c010c350>] (mydbg_set_fs+0x40/0x48)
[ 227.626671] r9:c08cf35c r8:ee1cda7c r7:ee1e3dce r6:bf000000 r5:00000000 r4:ffffe000
[ 227.634433] [<c010c310>] (mydbg_set_fs) from [<c021f0b8>] (__probe_kernel_read+0x44/0xd0)
[ 227.642629] [<c021f074>] (__probe_kernel_read) from [<c011b8d8>] (do_alignment+0x8c/0x75c)
[ 227.650909] r10:ef085000 r9:c08cf35c r8:00000001 r7:ee1e3dce r6:c011b84c r5:ee1cdbe0
[ 227.658748] r4:00000000 r3:00000000
[ 227.662338] [<c011b84c>] (do_alignment) from [<c0101394>] (do_DataAbort+0x40/0xc0)
[ 227.669921] r10:ef085000 r9:ee1cc000 r8:ee1cdbe0 r7:ee1e3dce r6:c011b84c r5:00000001
[ 227.677760] r4:c100dd3c
[ 227.680308] [<c0101354>] (do_DataAbort) from [<c010da44>] (__dabt_svc+0x64/0xa0)
[ 227.687714] Exception stack(0xee1cdbe0 to 0xee1cdc28)
[ 227.692780] dbe0: 9064a8c0 ee1e3de2 d82727d8 00000000 ee1b20c0 ee1e3dce 00000000 ef08572c
[ 227.700971] dc00: c0bb2034 c10c75ea ef085000 ee1cdc74 ee1cdc00 ee1cdc30 c01761a8 c08cf35c
[ 227.709158] dc20: 40070113 ffffffff
[ 227.712661] r8:c0bb2034 r7:ee1cdc14 r6:ffffffff r5:40070113 r4:c08cf35c
[ 227.719382] [<c08cf16c>] (inet_gro_receive) from [<c084a8ec>] (dev_gro_receive+0x2f0/0x618)
[ 227.727746] r10:ef085000 r9:00000001 r8:00000000 r7:ef085710 r6:c1008b88 r5:ee1b20c0
[ 227.735585] r4:c1009f78
[ 227.738132] [<c084a5fc>] (dev_gro_receive) from [<c084ac8c>] (napi_gro_receive+0x78/0x1f4)
[ 227.746410] r10:ef085000 r9:00000001 r8:c10d15ec r7:c100792c r6:ef085710 r5:c10c744e
[ 227.754249] r4:ee1b20c0
[ 227.756801] [<c084ac14>] (napi_gro_receive) from [<c06a2784>] (fec_enet_rx_napi+0x39c/0x988)
[ 227.765253] r9:00000001 r8:f0c8a960 r7:00000000 r6:00000000 r5:ef086000 r4:ee1b20c0
[ 227.773010] [<c06a23e8>] (fec_enet_rx_napi) from [<c084a3a4>] (net_rx_action+0x21c/0x474)
[ 227.781201] r10:ee1cdd78 r9:c0fa7b80 r8:ef7dab80 r7:0000012c r6:00000040 r5:00000001
[ 227.789039] r4:ef085710
[ 227.791593] [<c084a188>] (net_rx_action) from [<c012f2d4>] (__do_softirq+0x158/0x534)
[ 227.799437] r10:00000008 r9:ee1cc000 r8:c10ce568 r7:c100792c r6:c10247bd r5:00000003
[ 227.807275] r4:c100208c
[ 227.809824] [<c012f17c>] (__do_softirq) from [<c012fa68>] (irq_exit+0xec/0x168)
[ 227.817147] r10:c1007ea0 r9:ef010400 r8:00000001 r7:00000000 r6:c1007d3c r5:00000000
[ 227.824984] r4:c0fa534c
[ 227.827534] [<c012f97c>] (irq_exit) from [<c01883f4>] (__handle_domain_irq+0x74/0xe8)
[ 227.835377] [<c0188380>] (__handle_domain_irq) from [<c01015fc>] (gic_handle_irq+0x58/0xbc)
[ 227.843742] r9:f080b100 r8:c105ae80 r7:ee1cde80 r6:000003ff r5:000003eb r4:f080b10c
[ 227.851498] [<c01015a4>] (gic_handle_irq) from [<c010daf0>] (__irq_svc+0x70/0x98)
[ 227.858990] Exception stack(0xee1cde80 to 0xee1cdec8)
[ 227.864056] de80: ee7a1140 00000001 00000000 000012a9 ee7a1140 ee9d9f10 ee76edc0 ee9d9f60
[ 227.872248] dea0: 00000000 ee9d9f10 00000010 ee1cdeec ee1cdeb8 ee1cded0 c038a77c c0389688
[ 227.880434] dec0: 60070013 ffffffff
[ 227.883937] r10:00000010 r9:ee1cc000 r8:00000000 r7:ee1cdeb4 r6:ffffffff r5:60070013
[ 227.891775] r4:c0389688
[ 227.894327] [<c038a6f8>] (nfs_file_clear_open_context) from [<c03860e8>] (nfs_file_release+0x54/0x60)
[ 227.903558] r7:ee9a78a0 r6:ee68f010 r5:ee9d9f10 r4:ee76edc0
[ 227.909235] [<c0386094>] (nfs_file_release) from [<c0276cb4>] (__fput+0x94/0x1e0)
[ 227.916734] [<c0276c20>] (__fput) from [<c0276e60>] (____fput+0x10/0x14)
[ 227.923448] r10:c10d4298 r9:00000000 r8:00000000 r7:ef2ed780 r6:ef2edc00 r5:c10d5180
[ 227.931286] r4:ef2edbd4
[ 227.933839] [<c0276e50>] (____fput) from [<c014c534>] (task_work_run+0xc8/0xec)
[ 227.941166] [<c014c46c>] (task_work_run) from [<c010c484>] (do_work_pending+0x12c/0x1c4)
[ 227.949271] r9:ee1cdfb0 r8:00000000 r7:00000000 r6:ee1cc000 r5:00000000 r4:00000000
[ 227.957029] [<c010c358>] (do_work_pending) from [<c0107c90>] (slow_work_pending+0xc/0x20)
[ 227.965219] r10:00000000 r9:ee1cc000 r8:c0107e24 r7:0000005b r6:b6f76568 r5:b6f741f0
[ 227.973058] r4:b6f76904
Maybe the reason this reproduces easily in this particular setup is
that ethernet causes lots of alignment faults?
Can you try this change?

diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 3a48b54c6405..bc6ad7789568 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
int thread_flags, int syscall)
*/
trace_hardirqs_off();

- /* Check valid user FS if needed */
- addr_limit_user_check();
-
do {
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
+ } else if (thread_flags & _TIF_FSCHECK) {
+ addr_limit_user_check();
} else {
if (unlikely(!user_mode(regs)))
return 0;
Post by Leonard Crestez
--
Regards,
Leonard
--
Thomas
Leonard Crestez
2017-07-18 17:18:31 UTC
Permalink
Post by Thomas Garnier
Post by Leonard Crestez
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The TIF_SETFS flag is added to _TIF_WORK_MASK, shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to be used on a single instruction, so adapt ret_fast_syscall.
@@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
       * Update the trace code with the current status.
       */
      trace_hardirqs_off();
+
+     /* Check valid user FS if needed */
+     addr_limit_user_check();
+
      do {
              if (likely(thread_flags & _TIF_NEED_RESCHED)) {
                      schedule();
This patch made it's way into linux-next next-20170717 and it seems to
cause hangs when booting some boards over NFS (found via bisection). I
don't know exactly what determines the issue but I can reproduce hangs
if even if I just boot with init=/bin/bash and do stuff like
# sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
When this happens sysrq-t shows a sleep task hung in the 'R' state
spinning in do_work_pending, so maybe there is a potential infinite
loop here?
The addr_limit_user_check at the start of do_work_pending will check
for TIF_FSCHECK once and clear it but the function loops while
(thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
the loop will never terminate. Does this make sense?
Yes, it does. Thanks for looking into this.
Post by Leonard Crestez
I added some instrumentation to check if TIF_FSCHECK can show up during
the do_work_pending loop and the answer seems to be yes. I also tried
to get a stack with a set_fs call from inside do_work_pending and got
[  227.582402] CPU: 0 PID: 829 Comm: sleep Not tainted 4.12.0-01057-g93af8f7-dirty #332
[  227.590171] Hardware name: Freescale i.MX6 SoloLite (Device Tree)
[  227.598754] [] (dump_backtrace) from [] (show_stack+0x18/0x1c)
[  227.606339]  r7:00000000 r6:60070113 r5:00000000 r4:c105a958
[  227.612016] [] (show_stack) from [] (dump_stack+0xb4/0xe8)
[  227.619258] [] (dump_stack) from [] (mydbg_set_fs+0x40/0x48)
[  227.626671]  r9:c08cf35c r8:ee1cda7c r7:ee1e3dce r6:bf000000 r5:00000000 r4:ffffe000
[  227.634433] [] (mydbg_set_fs) from [] (__probe_kernel_read+0x44/0xd0)
[  227.642629] [] (__probe_kernel_read) from [] (do_alignment+0x8c/0x75c)
[  227.650909]  r10:ef085000 r9:c08cf35c r8:00000001 r7:ee1e3dce r6:c011b84c r5:ee1cdbe0
[  227.658748]  r4:00000000 r3:00000000
[  227.662338] [] (do_alignment) from [] (do_DataAbort+0x40/0xc0)
[  227.669921]  r10:ef085000 r9:ee1cc000 r8:ee1cdbe0 r7:ee1e3dce r6:c011b84c r5:00000001
[  227.677760]  r4:c100dd3c
[  227.680308] [] (do_DataAbort) from [] (__dabt_svc+0x64/0xa0)
[  227.687714] Exception stack(0xee1cdbe0 to 0xee1cdc28)
[  227.692780] dbe0: 9064a8c0 ee1e3de2 d82727d8 00000000 ee1b20c0 ee1e3dce 00000000 ef08572c
[  227.700971] dc00: c0bb2034 c10c75ea ef085000 ee1cdc74 ee1cdc00 ee1cdc30 c01761a8 c08cf35c
[  227.709158] dc20: 40070113 ffffffff
[  227.712661]  r8:c0bb2034 r7:ee1cdc14 r6:ffffffff r5:40070113 r4:c08cf35c
[  227.719382] [] (inet_gro_receive) from [] (dev_gro_receive+0x2f0/0x618)
[  227.727746]  r10:ef085000 r9:00000001 r8:00000000 r7:ef085710 r6:c1008b88 r5:ee1b20c0
[  227.735585]  r4:c1009f78
[  227.738132] [] (dev_gro_receive) from [] (napi_gro_receive+0x78/0x1f4)
[  227.746410]  r10:ef085000 r9:00000001 r8:c10d15ec r7:c100792c r6:ef085710 r5:c10c744e
[  227.754249]  r4:ee1b20c0
[  227.756801] [] (napi_gro_receive) from [] (fec_enet_rx_napi+0x39c/0x988)
[  227.765253]  r9:00000001 r8:f0c8a960 r7:00000000 r6:00000000 r5:ef086000 r4:ee1b20c0
[  227.773010] [] (fec_enet_rx_napi) from [] (net_rx_action+0x21c/0x474)
[  227.781201]  r10:ee1cdd78 r9:c0fa7b80 r8:ef7dab80 r7:0000012c r6:00000040 r5:00000001
[  227.789039]  r4:ef085710
[  227.791593] [] (net_rx_action) from [] (__do_softirq+0x158/0x534)
[  227.799437]  r10:00000008 r9:ee1cc000 r8:c10ce568 r7:c100792c r6:c10247bd r5:00000003
[  227.807275]  r4:c100208c
[  227.809824] [] (__do_softirq) from [] (irq_exit+0xec/0x168)
[  227.817147]  r10:c1007ea0 r9:ef010400 r8:00000001 r7:00000000 r6:c1007d3c r5:00000000
[  227.824984]  r4:c0fa534c
[  227.827534] [] (irq_exit) from [] (__handle_domain_irq+0x74/0xe8)
[  227.835377] [] (__handle_domain_irq) from [] (gic_handle_irq+0x58/0xbc)
[  227.843742]  r9:f080b100 r8:c105ae80 r7:ee1cde80 r6:000003ff r5:000003eb r4:f080b10c
[  227.851498] [] (gic_handle_irq) from [] (__irq_svc+0x70/0x98)
[  227.858990] Exception stack(0xee1cde80 to 0xee1cdec8)
[  227.864056] de80: ee7a1140 00000001 00000000 000012a9 ee7a1140 ee9d9f10 ee76edc0 ee9d9f60
[  227.872248] dea0: 00000000 ee9d9f10 00000010 ee1cdeec ee1cdeb8 ee1cded0 c038a77c c0389688
[  227.880434] dec0: 60070013 ffffffff
[  227.883937]  r10:00000010 r9:ee1cc000 r8:00000000 r7:ee1cdeb4 r6:ffffffff r5:60070013
[  227.891775]  r4:c0389688
[  227.894327] [] (nfs_file_clear_open_context) from [] (nfs_file_release+0x54/0x60)
[  227.903558]  r7:ee9a78a0 r6:ee68f010 r5:ee9d9f10 r4:ee76edc0
[  227.909235] [] (nfs_file_release) from [] (__fput+0x94/0x1e0)
[  227.916734] [] (__fput) from [] (____fput+0x10/0x14)
[  227.923448]  r10:c10d4298 r9:00000000 r8:00000000 r7:ef2ed780 r6:ef2edc00 r5:c10d5180
[  227.931286]  r4:ef2edbd4
[  227.933839] [] (____fput) from [] (task_work_run+0xc8/0xec)
[  227.941166] [] (task_work_run) from [] (do_work_pending+0x12c/0x1c4)
[  227.949271]  r9:ee1cdfb0 r8:00000000 r7:00000000 r6:ee1cc000 r5:00000000 r4:00000000
[  227.957029] [] (do_work_pending) from [] (slow_work_pending+0xc/0x20)
[  227.965219]  r10:00000000 r9:ee1cc000 r8:c0107e24 r7:0000005b r6:b6f76568 r5:b6f741f0
[  227.973058]  r4:b6f76904
Maybe the reason this reproduces easily in this particular setup is
that ethernet causes lots of alignment faults?
Can you try this change?
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 3a48b54c6405..bc6ad7789568 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
int thread_flags, int syscall)
  */
  trace_hardirqs_off();
- /* Check valid user FS if needed */
- addr_limit_user_check();
-
  do {
  if (likely(thread_flags & _TIF_NEED_RESCHED)) {
  schedule();
+ } else if (thread_flags & _TIF_FSCHECK) {
+ addr_limit_user_check();
  } else {
  if (unlikely(!user_mode(regs)))
  return 0;
This does seem to work, it no longer hangs on boot in my setup. This is
obviously only a very superficial test.

The new location of this check seems weird; it's not clear why it
should be on an else path. Perhaps it should be moved to right before
where current_thread_info()->flags is fetched again?

The issue seems like it would affect arm64 as well.

If the purpose is hardening against buggy kernel code doing bad set_fs
calls shouldn't this flag also be checked before looking at
TIF_NEED_RESCHED and calling schedule()?

--
Regards,
Leonard
Thomas Garnier
2017-07-18 19:04:36 UTC
Permalink
On Tue, Jul 18, 2017 at 10:18 AM, Leonard Crestez
Post by Leonard Crestez
Post by Thomas Garnier
Post by Leonard Crestez
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to used on a single instruction so adapt ret_fast_syscall.
@@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
* Update the trace code with the current status.
*/
trace_hardirqs_off();
+
+ /* Check valid user FS if needed */
+ addr_limit_user_check();
+
do {
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
This patch made it's way into linux-next next-20170717 and it seems to
cause hangs when booting some boards over NFS (found via bisection). I
don't know exactly what determines the issue but I can reproduce hangs
if even if I just boot with init=/bin/bash and do stuff like
# sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
When this happens sysrq-t shows a sleep task hung in the 'R' state
spinning in do_work_pending, so maybe there is a potential infinite
loop here?
The addr_limit_user_check at the start of do_work_pending will check
for TIF_FSCHECK once and clear it but the function loops while
(thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
the loop will never terminate. Does this make sense?
Yes, it does. Thanks for looking into this.
Post by Leonard Crestez
I added some instrumentation to check if TIF_FSCHECK can show up during
the do_work_pending loop and the answer seems to be yes. I also tried
to get a stack with a set_fs call from inside do_work_pending and got
[ 227.582402] CPU: 0 PID: 829 Comm: sleep Not tainted 4.12.0-01057-g93af8f7-dirty #332
[ 227.590171] Hardware name: Freescale i.MX6 SoloLite (Device Tree)
[ 227.598754] [] (dump_backtrace) from [] (show_stack+0x18/0x1c)
[ 227.606339] r7:00000000 r6:60070113 r5:00000000 r4:c105a958
[ 227.612016] [] (show_stack) from [] (dump_stack+0xb4/0xe8)
[ 227.619258] [] (dump_stack) from [] (mydbg_set_fs+0x40/0x48)
[ 227.626671] r9:c08cf35c r8:ee1cda7c r7:ee1e3dce r6:bf000000 r5:00000000 r4:ffffe000
[ 227.634433] [] (mydbg_set_fs) from [] (__probe_kernel_read+0x44/0xd0)
[ 227.642629] [] (__probe_kernel_read) from [] (do_alignment+0x8c/0x75c)
[ 227.650909] r10:ef085000 r9:c08cf35c r8:00000001 r7:ee1e3dce r6:c011b84c r5:ee1cdbe0
[ 227.658748] r4:00000000 r3:00000000
[ 227.662338] [] (do_alignment) from [] (do_DataAbort+0x40/0xc0)
[ 227.669921] r10:ef085000 r9:ee1cc000 r8:ee1cdbe0 r7:ee1e3dce r6:c011b84c r5:00000001
[ 227.677760] r4:c100dd3c
[ 227.680308] [] (do_DataAbort) from [] (__dabt_svc+0x64/0xa0)
[ 227.687714] Exception stack(0xee1cdbe0 to 0xee1cdc28)
[ 227.692780] dbe0: 9064a8c0 ee1e3de2 d82727d8 00000000 ee1b20c0 ee1e3dce 00000000 ef08572c
[ 227.700971] dc00: c0bb2034 c10c75ea ef085000 ee1cdc74 ee1cdc00 ee1cdc30 c01761a8 c08cf35c
[ 227.709158] dc20: 40070113 ffffffff
[ 227.712661] r8:c0bb2034 r7:ee1cdc14 r6:ffffffff r5:40070113 r4:c08cf35c
[ 227.719382] [] (inet_gro_receive) from [] (dev_gro_receive+0x2f0/0x618)
[ 227.727746] r10:ef085000 r9:00000001 r8:00000000 r7:ef085710 r6:c1008b88 r5:ee1b20c0
[ 227.735585] r4:c1009f78
[ 227.738132] [] (dev_gro_receive) from [] (napi_gro_receive+0x78/0x1f4)
[ 227.746410] r10:ef085000 r9:00000001 r8:c10d15ec r7:c100792c r6:ef085710 r5:c10c744e
[ 227.754249] r4:ee1b20c0
[ 227.756801] [] (napi_gro_receive) from [] (fec_enet_rx_napi+0x39c/0x988)
[ 227.765253] r9:00000001 r8:f0c8a960 r7:00000000 r6:00000000 r5:ef086000 r4:ee1b20c0
[ 227.773010] [] (fec_enet_rx_napi) from [] (net_rx_action+0x21c/0x474)
[ 227.781201] r10:ee1cdd78 r9:c0fa7b80 r8:ef7dab80 r7:0000012c r6:00000040 r5:00000001
[ 227.789039] r4:ef085710
[ 227.791593] [] (net_rx_action) from [] (__do_softirq+0x158/0x534)
[ 227.799437] r10:00000008 r9:ee1cc000 r8:c10ce568 r7:c100792c r6:c10247bd r5:00000003
[ 227.807275] r4:c100208c
[ 227.809824] [] (__do_softirq) from [] (irq_exit+0xec/0x168)
[ 227.817147] r10:c1007ea0 r9:ef010400 r8:00000001 r7:00000000 r6:c1007d3c r5:00000000
[ 227.824984] r4:c0fa534c
[ 227.827534] [] (irq_exit) from [] (__handle_domain_irq+0x74/0xe8)
[ 227.835377] [] (__handle_domain_irq) from [] (gic_handle_irq+0x58/0xbc)
[ 227.843742] r9:f080b100 r8:c105ae80 r7:ee1cde80 r6:000003ff r5:000003eb r4:f080b10c
[ 227.851498] [] (gic_handle_irq) from [] (__irq_svc+0x70/0x98)
[ 227.858990] Exception stack(0xee1cde80 to 0xee1cdec8)
[ 227.864056] de80: ee7a1140 00000001 00000000 000012a9 ee7a1140 ee9d9f10 ee76edc0 ee9d9f60
[ 227.872248] dea0: 00000000 ee9d9f10 00000010 ee1cdeec ee1cdeb8 ee1cded0 c038a77c c0389688
[ 227.880434] dec0: 60070013 ffffffff
[ 227.883937] r10:00000010 r9:ee1cc000 r8:00000000 r7:ee1cdeb4 r6:ffffffff r5:60070013
[ 227.891775] r4:c0389688
[ 227.894327] [] (nfs_file_clear_open_context) from [] (nfs_file_release+0x54/0x60)
[ 227.903558] r7:ee9a78a0 r6:ee68f010 r5:ee9d9f10 r4:ee76edc0
[ 227.909235] [] (nfs_file_release) from [] (__fput+0x94/0x1e0)
[ 227.916734] [] (__fput) from [] (____fput+0x10/0x14)
[ 227.923448] r10:c10d4298 r9:00000000 r8:00000000 r7:ef2ed780 r6:ef2edc00 r5:c10d5180
[ 227.931286] r4:ef2edbd4
[ 227.933839] [] (____fput) from [] (task_work_run+0xc8/0xec)
[ 227.941166] [] (task_work_run) from [] (do_work_pending+0x12c/0x1c4)
[ 227.949271] r9:ee1cdfb0 r8:00000000 r7:00000000 r6:ee1cc000 r5:00000000 r4:00000000
[ 227.957029] [] (do_work_pending) from [] (slow_work_pending+0xc/0x20)
[ 227.965219] r10:00000000 r9:ee1cc000 r8:c0107e24 r7:0000005b r6:b6f76568 r5:b6f741f0
[ 227.973058] r4:b6f76904
Maybe the reason this reproduces easily in this particular setup is
that ethernet causes lots of alignment faults?
Can you try this change?
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 3a48b54c6405..bc6ad7789568 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
int thread_flags, int syscall)
*/
trace_hardirqs_off();
- /* Check valid user FS if needed */
- addr_limit_user_check();
-
do {
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
+ } else if (thread_flags & _TIF_FSCHECK) {
+ addr_limit_user_check();
} else {
if (unlikely(!user_mode(regs)))
return 0;
This does seem to work, it no longer hangs on boot in my setup. This is
obviously only a very superficial test.
The new location of this check seems weird, it's not clear why it
should be on an else path. Perhaps it should be moved to right before
where current_thread_info()->flags is fetched again?
I was hitting a bug when I tried that. I think that's because you
basically let the signal handler do pending work before you check the
flag, which is not a good idea.
Post by Leonard Crestez
The issue seems like it would affect arm64 as well.
Yes, I will propose a fix on each architecture.
Post by Leonard Crestez
If the purpose is hardening against buggy kernel code doing bad set_fs
calls shouldn't this flag also be checked before looking at
TIF_NEED_RESCHED and calling schedule()?
I am not sure, to be honest. I expected schedule to only schedule the
processor to another task, which would be fine given only the current
task has a bogus fs. I will put it first in case there is an edge
case scenario I missed.

What do you think? Let me know and I will look at changing all
architectures and testing them.

Thanks!
Post by Leonard Crestez
--
Regards,
Leonard
--
Thomas
Leonard Crestez
2017-07-19 14:58:20 UTC
Permalink
Post by Thomas Garnier
Post by Leonard Crestez
Post by Thomas Garnier
Post by Leonard Crestez
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to used on a single instruction so adapt ret_fast_syscall.
@@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
       * Update the trace code with the current status.
       */
      trace_hardirqs_off();
+
+     /* Check valid user FS if needed */
+     addr_limit_user_check();
+
      do {
              if (likely(thread_flags & _TIF_NEED_RESCHED)) {
                      schedule();
This patch made it's way into linux-next next-20170717 and it seems to
cause hangs when booting some boards over NFS (found via bisection). I
don't know exactly what determines the issue but I can reproduce hangs
if even if I just boot with init=/bin/bash and do stuff like
# sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
When this happens sysrq-t shows a sleep task hung in the 'R' state
spinning in do_work_pending, so maybe there is a potential infinite
loop here?
The addr_limit_user_check at the start of do_work_pending will check
for TIF_FSCHECK once and clear it but the function loops while
(thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
the loop will never terminate. Does this make sense?
Yes, it does. Thanks for looking into this.
Can you try this change?
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 3a48b54c6405..bc6ad7789568 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
int thread_flags, int syscall)
  */
  trace_hardirqs_off();
- /* Check valid user FS if needed */
- addr_limit_user_check();
-
  do {
  if (likely(thread_flags & _TIF_NEED_RESCHED)) {
  schedule();
+ } else if (thread_flags & _TIF_FSCHECK) {
+ addr_limit_user_check();
  } else {
  if (unlikely(!user_mode(regs)))
  return 0;
This does seem to work, it no longer hangs on boot in my setup. This is
obviously only a very superficial test.
The new location of this check seems weird, it's not clear why it
should be on an else path. Perhaps it should be moved to right before
where current_thread_info()->flags is fetched again?
I was hitting bug when I tried that.I think that's because you
basically let the signal handler do pending work before you check the
flag, that's not a good idea.
Post by Leonard Crestez
If the purpose is hardening against buggy kernel code doing bad set_fs
calls shouldn't this flag also be checked before looking at
TIF_NEED_RESCHED and calling schedule()?
I am not sure to be honest. I expected schedule to only schedule the
processor to another task which would be fine given only the current
task have a bogus fs. I will put it first in case there is an edge
case scenario I missed.
What do you think? Let me know and I will look at changes all
architectures and testing them.
I don't know and I'd rather not guess on security issues. It's better
if someone else reviews the code.

Unless there is a very quick fix maybe this series should be removed or
reverted from linux-next? A diagnosis of "system calls can sometimes
hang on return" seems serious even for linux-next. Since it happens
very rarely in most setups I can easily imagine somebody spending a lot
of time digging at this.

--
Regards,
Leonard
Thomas Garnier
2017-07-19 16:51:12 UTC
Permalink
On Wed, Jul 19, 2017 at 7:58 AM, Leonard Crestez
Post by Leonard Crestez
Post by Thomas Garnier
Post by Leonard Crestez
Post by Thomas Garnier
Post by Leonard Crestez
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to used on a single instruction so adapt ret_fast_syscall.
@@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
* Update the trace code with the current status.
*/
trace_hardirqs_off();
+
+ /* Check valid user FS if needed */
+ addr_limit_user_check();
+
do {
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
This patch made it's way into linux-next next-20170717 and it seems to
cause hangs when booting some boards over NFS (found via bisection). I
don't know exactly what determines the issue but I can reproduce hangs
if even if I just boot with init=/bin/bash and do stuff like
# sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
When this happens sysrq-t shows a sleep task hung in the 'R' state
spinning in do_work_pending, so maybe there is a potential infinite
loop here?
The addr_limit_user_check at the start of do_work_pending will check
for TIF_FSCHECK once and clear it but the function loops while
(thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
the loop will never terminate. Does this make sense?
Yes, it does. Thanks for looking into this.
Can you try this change?
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 3a48b54c6405..bc6ad7789568 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
int thread_flags, int syscall)
*/
trace_hardirqs_off();
- /* Check valid user FS if needed */
- addr_limit_user_check();
-
do {
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
+ } else if (thread_flags & _TIF_FSCHECK) {
+ addr_limit_user_check();
} else {
if (unlikely(!user_mode(regs)))
return 0;
This does seem to work, it no longer hangs on boot in my setup. This is
obviously only a very superficial test.
The new location of this check seems weird, it's not clear why it
should be on an else path. Perhaps it should be moved to right before
where current_thread_info()->flags is fetched again?
I was hitting bug when I tried that.I think that's because you
basically let the signal handler do pending work before you check the
flag, that's not a good idea.
Post by Leonard Crestez
If the purpose is hardening against buggy kernel code doing bad set_fs
calls shouldn't this flag also be checked before looking at
TIF_NEED_RESCHED and calling schedule()?
I am not sure to be honest. I expected schedule to only schedule the
processor to another task which would be fine given only the current
task have a bogus fs. I will put it first in case there is an edge
case scenario I missed.
What do you think? Let me know and I will look at changes all
architectures and testing them.
I don't know and I'd rather not guess on security issues. It's better
if someone else reviews the code.
Unless there is a very quick fix maybe this series should be removed or
reverted from linux-next? A diagnosis of "system calls can sometimes
hang on return" seems serious even for linux-next. Since it happens
very rarely in most setups I can easily imagine somebody spending a lot
of time digging at this.
I will send fixes for each architecture in the meantime.
Post by Leonard Crestez
--
Regards,
Leonard
--
Thomas
Russell King - ARM Linux
2017-07-19 17:06:14 UTC
Permalink
Post by Leonard Crestez
Post by Thomas Garnier
Post by Leonard Crestez
Post by Thomas Garnier
Post by Leonard Crestez
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to used on a single instruction so adapt ret_fast_syscall.
@@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
       * Update the trace code with the current status.
       */
      trace_hardirqs_off();
+
+     /* Check valid user FS if needed */
+     addr_limit_user_check();
+
      do {
              if (likely(thread_flags & _TIF_NEED_RESCHED)) {
                      schedule();
This patch made it's way into linux-next next-20170717 and it seems to
cause hangs when booting some boards over NFS (found via bisection). I
don't know exactly what determines the issue but I can reproduce hangs
if even if I just boot with init=/bin/bash and do stuff like
# sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
When this happens sysrq-t shows a sleep task hung in the 'R' state
spinning in do_work_pending, so maybe there is a potential infinite
loop here?
The addr_limit_user_check at the start of do_work_pending will check
for TIF_FSCHECK once and clear it but the function loops while
(thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
the loop will never terminate. Does this make sense?
Yes, it does. Thanks for looking into this.
Can you try this change?
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 3a48b54c6405..bc6ad7789568 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
int thread_flags, int syscall)
  */
  trace_hardirqs_off();
- /* Check valid user FS if needed */
- addr_limit_user_check();
-
  do {
  if (likely(thread_flags & _TIF_NEED_RESCHED)) {
  schedule();
+ } else if (thread_flags & _TIF_FSCHECK) {
+ addr_limit_user_check();
  } else {
  if (unlikely(!user_mode(regs)))
  return 0;
This does seem to work, it no longer hangs on boot in my setup. This is
obviously only a very superficial test.
The new location of this check seems weird, it's not clear why it
should be on an else path. Perhaps it should be moved to right before
where current_thread_info()->flags is fetched again?
I was hitting a bug when I tried that. I think that's because you
basically let the signal handler do pending work before you check the
flag; that's not a good idea.
Post by Leonard Crestez
If the purpose is hardening against buggy kernel code doing bad set_fs
calls shouldn't this flag also be checked before looking at
TIF_NEED_RESCHED and calling schedule()?
I am not sure to be honest. I expected schedule to only schedule the
processor to another task which would be fine given only the current
task has a bogus fs. I will put it first in case there is an edge
case scenario I missed.
What do you think? Let me know and I will look at changing all
architectures and testing them.
I don't know and I'd rather not guess on security issues. It's better
if someone else reviews the code.
Unless there is a very quick fix maybe this series should be removed or
reverted from linux-next? A diagnosis of "system calls can sometimes
hang on return" seems serious even for linux-next. Since it happens
very rarely in most setups I can easily imagine somebody spending a lot
of time digging at this.
Probably best to revert. I stopped looking at these patches during
the discussion, as the discussion seemed to be mainly around other
architectures, and I thought we had ARM settled.

Looking at this patch now, there's several things I'm not happy with.

The effect of adding the new TIF flag for FSCHECK amongst the other
flags is that we end up overflowing the 8-bit constant, and have to
split the tests, meaning more instructions in the return path. Eg:

- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
bne fast_work_pending

should be written:

tst r1, #_TIF_SYSCALL_WORK
tsteq r1, #_TIF_WORK_MASK
bne fast_work_pending

and:

- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK

should be:

tst r1, #_TIF_SYSCALL_WORK
tsteq r1, #_TIF_WORK_MASK

There's no need for extra branches.

Now, the next issue is that I don't think this TIF-flag approach is
good for ARM - alignment faults can happen any time due to misaligned
packets in the networking code, and we really don't want to be doing
this check in a place that we can loop.

My original suggestion for ARM was to do the address limit check after
all work had been processed, with interrupts disabled (so no
possibility of this kind of loop happening.) However, that seems to
have been replaced with this TIF approach, which is going to cause
loops - I suspect if the probes code is enabled, this will suffer
the same problem. Remember, the various probes stuff can walk
userspace stacks, which means they'll be using set_fs().

I don't see why we've ended up with this (imho) sub-standard TIF-flag
approach, and I think it's going to be very problematical.

Can we please go back to the approach I suggested back in March for
ARM that doesn't suffer from this problem?
--
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.
Thomas Garnier
2017-07-19 17:20:35 UTC
Permalink
On Wed, Jul 19, 2017 at 10:06 AM, Russell King - ARM Linux
Post by Russell King - ARM Linux
Post by Leonard Crestez
Post by Thomas Garnier
Post by Leonard Crestez
Post by Thomas Garnier
Post by Leonard Crestez
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to be used in a single instruction, so adapt ret_fast_syscall.
@@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
* Update the trace code with the current status.
*/
trace_hardirqs_off();
+
+ /* Check valid user FS if needed */
+ addr_limit_user_check();
+
do {
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
This patch made its way into linux-next next-20170717 and it seems to
cause hangs when booting some boards over NFS (found via bisection). I
don't know exactly what determines the issue but I can reproduce hangs
even if I just boot with init=/bin/bash and do stuff like
# sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
When this happens sysrq-t shows a sleep task hung in the 'R' state
spinning in do_work_pending, so maybe there is a potential infinite
loop here?
The addr_limit_user_check at the start of do_work_pending will check
for TIF_FSCHECK once and clear it but the function loops while
(thread_flags & _TIF_WORK_MASK), so if TIF_FSCHECK is set again then
the loop will never terminate. Does this make sense?
Yes, it does. Thanks for looking into this.
Can you try this change?
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 3a48b54c6405..bc6ad7789568 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
int thread_flags, int syscall)
*/
trace_hardirqs_off();
- /* Check valid user FS if needed */
- addr_limit_user_check();
-
do {
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
+ } else if (thread_flags & _TIF_FSCHECK) {
+ addr_limit_user_check();
} else {
if (unlikely(!user_mode(regs)))
return 0;
This does seem to work, it no longer hangs on boot in my setup. This is
obviously only a very superficial test.
The new location of this check seems weird, it's not clear why it
should be on an else path. Perhaps it should be moved to right before
where current_thread_info()->flags is fetched again?
I was hitting a bug when I tried that. I think that's because you
basically let the signal handler do pending work before you check the
flag; that's not a good idea.
Post by Leonard Crestez
If the purpose is hardening against buggy kernel code doing bad set_fs
calls shouldn't this flag also be checked before looking at
TIF_NEED_RESCHED and calling schedule()?
I am not sure to be honest. I expected schedule to only schedule the
processor to another task which would be fine given only the current
task has a bogus fs. I will put it first in case there is an edge
case scenario I missed.
What do you think? Let me know and I will look at changing all
architectures and testing them.
I don't know and I'd rather not guess on security issues. It's better
if someone else reviews the code.
Unless there is a very quick fix maybe this series should be removed or
reverted from linux-next? A diagnosis of "system calls can sometimes
hang on return" seems serious even for linux-next. Since it happens
very rarely in most setups I can easily imagine somebody spending a lot
of time digging at this.
Probably best to revert. I stopped looking at these patches during
the discussion, as the discussion seemed to be mainly around other
architectures, and I thought we had ARM settled.
Looking at this patch now, there's several things I'm not happy with.
The effect of adding the new TIF flag for FSCHECK amongst the other
flags is that we end up overflowing the 8-bit constant, and have to
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
bne fast_work_pending
tst r1, #_TIF_SYSCALL_WORK
tsteq r1, #_TIF_WORK_MASK
bne fast_work_pending
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
tst r1, #_TIF_SYSCALL_WORK
tsteq r1, #_TIF_WORK_MASK
There's no need for extra branches.
Now, the next issue is that I don't think this TIF-flag approach is
good for ARM - alignment faults can happen any time due to misaligned
packets in the networking code, and we really don't want to be doing
this check in a place that we can loop.
My original suggestion for ARM was to do the address limit check after
all work had been processed, with interrupts disabled (so no
possibility of this kind of loop happening.) However, that seems to
have been replaced with this TIF approach, which is going to cause
loops - I suspect if the probes code is enabled, this will suffer
the same problem. Remember, the various probes stuff can walk
userspace stacks, which means they'll be using set_fs().
I don't see why we've ended up with this (imho) sub-standard TIF-flag
approach, and I think it's going to be very problematical.
Can we please go back to the approach I suggested back in March for
ARM that doesn't suffer from this problem?
During the extensive thread discussion, Linus asked to move away from
architecture specific changes to this work flag system. I am glad to
fix the assembly as you asked on a separate patch.
Post by Russell King - ARM Linux
--
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.
--
Thomas
Russell King - ARM Linux
2017-07-19 18:35:43 UTC
Permalink
Post by Thomas Garnier
On Wed, Jul 19, 2017 at 10:06 AM, Russell King - ARM Linux
Post by Russell King - ARM Linux
Probably best to revert. I stopped looking at these patches during
the discussion, as the discussion seemed to be mainly around other
architectures, and I thought we had ARM settled.
Looking at this patch now, there's several things I'm not happy with.
The effect of adding the new TIF flag for FSCHECK amongst the other
flags is that we end up overflowing the 8-bit constant, and have to
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
bne fast_work_pending
tst r1, #_TIF_SYSCALL_WORK
tsteq r1, #_TIF_WORK_MASK
bne fast_work_pending
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
tst r1, #_TIF_SYSCALL_WORK
tsteq r1, #_TIF_WORK_MASK
There's no need for extra branches.
Now, the next issue is that I don't think this TIF-flag approach is
good for ARM - alignment faults can happen any time due to misaligned
packets in the networking code, and we really don't want to be doing
this check in a place that we can loop.
My original suggestion for ARM was to do the address limit check after
all work had been processed, with interrupts disabled (so no
possibility of this kind of loop happening.) However, that seems to
have been replaced with this TIF approach, which is going to cause
loops - I suspect if the probes code is enabled, this will suffer
the same problem. Remember, the various probes stuff can walk
userspace stacks, which means they'll be using set_fs().
I don't see why we've ended up with this (imho) sub-standard TIF-flag
approach, and I think it's going to be very problematical.
Can we please go back to the approach I suggested back in March for
ARM that doesn't suffer from this problem?
During the extensive thread discussion, Linus asked to move away from
architecture specific changes to this work flag system. I am glad to
fix the assembly as you asked on a separate patch.
Well, for the record, I don't think you've got to the bottom of the
"infinite loop" potential of Linus' approach.

Eg, perf will likely trigger this same issue. Eg, perf record -a -g
will attempt to record the callchain both in kernel space and userspace
each time a perf interrupt happens. If the perf interrupt frequency is
sufficiently high that we have multiple interrupts during the execution
of do_work_pending() and its called functions, then that will turn this
into an infinite loop yet again.
--
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.
Thomas Garnier
2017-07-19 18:50:39 UTC
Permalink
On Wed, Jul 19, 2017 at 11:35 AM, Russell King - ARM Linux
Post by Russell King - ARM Linux
Post by Thomas Garnier
On Wed, Jul 19, 2017 at 10:06 AM, Russell King - ARM Linux
Post by Russell King - ARM Linux
Probably best to revert. I stopped looking at these patches during
the discussion, as the discussion seemed to be mainly around other
architectures, and I thought we had ARM settled.
Looking at this patch now, there's several things I'm not happy with.
The effect of adding the new TIF flag for FSCHECK amongst the other
flags is that we end up overflowing the 8-bit constant, and have to
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
bne fast_work_pending
tst r1, #_TIF_SYSCALL_WORK
tsteq r1, #_TIF_WORK_MASK
bne fast_work_pending
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
tst r1, #_TIF_SYSCALL_WORK
tsteq r1, #_TIF_WORK_MASK
There's no need for extra branches.
Now, the next issue is that I don't think this TIF-flag approach is
good for ARM - alignment faults can happen any time due to misaligned
packets in the networking code, and we really don't want to be doing
this check in a place that we can loop.
My original suggestion for ARM was to do the address limit check after
all work had been processed, with interrupts disabled (so no
possibility of this kind of loop happening.) However, that seems to
have been replaced with this TIF approach, which is going to cause
loops - I suspect if the probes code is enabled, this will suffer
the same problem. Remember, the various probes stuff can walk
userspace stacks, which means they'll be using set_fs().
I don't see why we've ended up with this (imho) sub-standard TIF-flag
approach, and I think it's going to be very problematical.
Can we please go back to the approach I suggested back in March for
ARM that doesn't suffer from this problem?
During the extensive thread discussion, Linus asked to move away from
architecture specific changes to this work flag system. I am glad to
fix the assembly as you asked on a separate patch.
Well, for the record, I don't think you've got to the bottom of the
"infinite loop" potential of Linus' approach.
Eg, perf will likely trigger this same issue. Eg, perf record -a -g
will attempt to record the callchain both in kernel space and userspace
each time a perf interrupt happens. If the perf interrupt frequency is
sufficiently high that we have multiple interrupts during the execution
of do_work_pending() and its called functions, then that will turn this
into an infinite loop yet again.
Do you think it applies to the patch I just sent? The other approach
is to check at the entrance, ignore _TIF_FSCHECK in the loop and clear
it on exit.
Post by Russell King - ARM Linux
--
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.
--
Thomas
Thomas Garnier
2017-06-15 01:12:03 UTC
Permalink
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].

The set_fs function sets the TIF_FSCHECK flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.

[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990

Signed-off-by: Thomas Garnier <***@google.com>
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.

Based on next-20170609
---
arch/arm64/include/asm/thread_info.h | 4 +++-
arch/arm64/include/asm/uaccess.h | 3 +++
arch/arm64/kernel/signal.c | 5 +++++
3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 46c3b93cf865..c5ba565544ee 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -86,6 +86,7 @@ struct thread_info {
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
+#define TIF_FSCHECK 5 /* Check FS is USER_DS on return */
#define TIF_NOHZ 7
#define TIF_SYSCALL_TRACE 8
#define TIF_SYSCALL_AUDIT 9
@@ -107,11 +108,12 @@ struct thread_info {
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_UPROBE (1 << TIF_UPROBE)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
#define _TIF_32BIT (1 << TIF_32BIT)

#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
- _TIF_UPROBE)
+ _TIF_UPROBE | _TIF_FSCHECK)

#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 7b8a04789cef..ced7a7c2dd41 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -45,6 +45,9 @@ static inline void set_fs(mm_segment_t fs)
{
current_thread_info()->addr_limit = fs;

+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
+
/*
* Enable/disable UAO so that copy_to_user() etc can access
* kernel memory with the unprivileged instructions.
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index c7b6de62f9d3..0f0279148bdc 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -25,6 +25,7 @@
#include <linux/uaccess.h>
#include <linux/tracehook.h>
#include <linux/ratelimit.h>
+#include <linux/syscalls.h>

#include <asm/debug-monitors.h>
#include <asm/elf.h>
@@ -408,6 +409,10 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
* Update the trace code with the current status.
*/
trace_hardirqs_off();
+
+ /* Check valid user FS if needed */
+ addr_limit_user_check();
+
do {
if (thread_flags & _TIF_NEED_RESCHED) {
schedule();
--
2.13.1.518.g3df882009-goog
Catalin Marinas
2017-06-21 08:16:01 UTC
Permalink
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.
Based on next-20170609
---
arch/arm64/include/asm/thread_info.h | 4 +++-
arch/arm64/include/asm/uaccess.h | 3 +++
arch/arm64/kernel/signal.c | 5 +++++
3 files changed, 11 insertions(+), 1 deletion(-)
For arm64:

Reviewed-by: Catalin Marinas <***@arm.com>
Thomas Garnier
2017-06-21 13:57:44 UTC
Permalink
On Wed, Jun 21, 2017 at 1:16 AM, Catalin Marinas
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.
Based on next-20170609
---
arch/arm64/include/asm/thread_info.h | 4 +++-
arch/arm64/include/asm/uaccess.h | 3 +++
arch/arm64/kernel/signal.c | 5 +++++
3 files changed, 11 insertions(+), 1 deletion(-)
Thanks Catalin
--
Thomas
Kees Cook
2017-06-20 20:24:07 UTC
Permalink
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The addr_limit_user_check function is added as a cross-architecture
function to check the address limit.
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
Thanks for reworking this series!

The bad state correctly BUGs under the LKDTM test:

[ 21.171586] lkdtm: Performing direct entry CORRUPT_USER_DS
[ 21.172791] lkdtm: setting bad task size limit
[ 21.173742] ------------[ cut here ]------------
[ 21.174641] kernel BUG at ./include/linux/syscalls.h:220!
...
[ 21.193166] Call Trace:
[ 21.193617] ? trace_hardirqs_on_thunk+0x1a/0x1c
[ 21.194443] entry_SYSCALL64_slow_path+0x25/0x25


Tested-by: Kees Cook <***@chromium.org>

-Kees
--
Kees Cook
Pixel Security
Kees Cook
2017-06-28 17:52:55 UTC
Permalink
Post by Kees Cook
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The addr_limit_user_check function is added as a cross-architecture
function to check the address limit.
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
Thanks for reworking this series!
[ 21.171586] lkdtm: Performing direct entry CORRUPT_USER_DS
[ 21.172791] lkdtm: setting bad task size limit
[ 21.173742] ------------[ cut here ]------------
[ 21.174641] kernel BUG at ./include/linux/syscalls.h:220!
...
[ 21.193617] ? trace_hardirqs_on_thunk+0x1a/0x1c
[ 21.194443] entry_SYSCALL64_slow_path+0x25/0x25
Is everyone happy with this patch for x86? Does this need anything
more/different?

Thanks!

-Kees
--
Kees Cook
Pixel Security
Thomas Garnier
2017-07-06 20:38:19 UTC
Permalink
Post by Kees Cook
Post by Kees Cook
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The addr_limit_user_check function is added as a cross-architecture
function to check the address limit.
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
Thanks for reworking this series!
[ 21.171586] lkdtm: Performing direct entry CORRUPT_USER_DS
[ 21.172791] lkdtm: setting bad task size limit
[ 21.173742] ------------[ cut here ]------------
[ 21.174641] kernel BUG at ./include/linux/syscalls.h:220!
...
[ 21.193617] ? trace_hardirqs_on_thunk+0x1a/0x1c
[ 21.194443] entry_SYSCALL64_slow_path+0x25/0x25
Is everyone happy with this patch for x86? Does this need anything
more/different?
Asking again. Any additional feedback? Does anyone want to pick it up?
Post by Kees Cook
Thanks!
-Kees
--
Kees Cook
Pixel Security
--
Thomas
Thomas Gleixner
2017-07-06 20:48:46 UTC
Permalink
Post by Thomas Garnier
Post by Kees Cook
Post by Kees Cook
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The addr_limit_user_check function is added as a cross-architecture
function to check the address limit.
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
Thanks for reworking this series!
[ 21.171586] lkdtm: Performing direct entry CORRUPT_USER_DS
[ 21.172791] lkdtm: setting bad task size limit
[ 21.173742] ------------[ cut here ]------------
[ 21.174641] kernel BUG at ./include/linux/syscalls.h:220!
...
[ 21.193617] ? trace_hardirqs_on_thunk+0x1a/0x1c
[ 21.194443] entry_SYSCALL64_slow_path+0x25/0x25
Is everyone happy with this patch for x86? Does this need anything
more/different?
Asking again. Any additional feedback? Does anyone want to pick it up?
Can do. This needs to be a combo of all 3 I assume as the x86 one contains
the function used by all of them, right?

Thanks,

tglx
Thomas Garnier
2017-07-06 20:52:24 UTC
Permalink
Post by Thomas Gleixner
Post by Thomas Garnier
Post by Kees Cook
Post by Kees Cook
Post by Thomas Garnier
Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
privileges [1].
The set_fs function sets the TIF_SETFS flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.
The addr_limit_user_check function is added as a cross-architecture
function to check the address limit.
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
Thanks for reworking this series!
[ 21.171586] lkdtm: Performing direct entry CORRUPT_USER_DS
[ 21.172791] lkdtm: setting bad task size limit
[ 21.173742] ------------[ cut here ]------------
[ 21.174641] kernel BUG at ./include/linux/syscalls.h:220!
...
[ 21.193617] ? trace_hardirqs_on_thunk+0x1a/0x1c
[ 21.194443] entry_SYSCALL64_slow_path+0x25/0x25
Is everyone happy with this patch for x86? Does this need anything
more/different?
Asking again. Any additional feedback? Does anyone want to pick it up?
Can do. This needs to be a combo of all 3 I assume as the x86 one contains
the function used by all of them, right?
That is correct.
Post by Thomas Gleixner
Thanks,
tglx
--
Thomas
Loading...