Discussion:
[PATCH v7] ARM: net: JIT compiler for packet filters
(too old to reply)
Mircea Gherzan
2012-01-07 11:52:34 UTC
Permalink
Based of Matt Evans's PPC64 implementation.

The compiler generates ARM instructions but interworking is
supported for Thumb2 kernels.

Supports both little and big endian. Unaligned loads are emitted
for ARMv6+. Not all the BPF opcodes that deal with ancillary data
are supported. The scratch memory of the filter lives on the stack.
Hardware integer division is used if it is available.

Enabled in the same way as for x86-64 and PPC64:

echo 1 > /proc/sys/net/core/bpf_jit_enable

A value greater than 1 enables opcode output.

Signed-off-by: Mircea Gherzan <***@gmail.com>
---

Changes in v7:
* fix the intruction generation for LDX_MSH, OR_X, LSH_K,
RSH_K and JMP_JA
* fix the condition for saving the A register
* use fls() instead of the compiler builtin
* punt to the interpreter on absolute loads with K < 0
* check for invalid data references
* support the NEG opcode
* clear X in the prologue based on a context flag
* simplify the conditional jumps

Changes in v6:
* fix the code generation for the ANC_CPU opcode

Changes in v5:
* replace SEEN_LEN with SEEN_SKB
* set ctx->seen when handling some ancillary data opcodes

Changes in v4:
* first check if the JIT compiler is enabled
* fix the code generation for the LDX_MSH opcode

Changes in v3:
* no longer depend on EABI and !Thumb2
* add BLX "emulation" for ARMv4 without Thumb
* use the integer divide instruction on Cortex-A15
* fix the handling of the DIV_K opcode
* use a C wrapper for __aeabi_uidiv
* fix the generation of the epilogue (non-FP case)

Changes in v2:
* enable the compiler only for ARMv5+ because of the BLX instruction
* use the same comparison for the ARM version checks
* use misaligned accesses on ARMv6
* fix the SEEN_MEM
* fix the mem_words_used()

arch/arm/Kconfig | 1 +
arch/arm/Makefile | 1 +
arch/arm/net/Makefile | 3 +
arch/arm/net/bpf_jit_32.c | 912 +++++++++++++++++++++++++++++++++++++++++++++
arch/arm/net/bpf_jit_32.h | 190 ++++++++++
5 files changed, 1107 insertions(+), 0 deletions(-)
create mode 100644 arch/arm/net/Makefile
create mode 100644 arch/arm/net/bpf_jit_32.c
create mode 100644 arch/arm/net/bpf_jit_32.h

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 3ee1818..c4ed36f 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -30,6 +30,7 @@ config ARM
select HAVE_SPARSE_IRQ
select GENERIC_IRQ_SHOW
select CPU_PM if (SUSPEND || CPU_IDLE)
+ select HAVE_BPF_JIT
help
The ARM series is a line of low-power-consumption RISC chip designs
licensed by ARM Ltd and targeted at embedded applications and
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index dfcf3b0..8810a10 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -255,6 +255,7 @@ core-$(CONFIG_VFP) += arch/arm/vfp/

# If we have a machine-specific directory, then include it in the build.
core-y += arch/arm/kernel/ arch/arm/mm/ arch/arm/common/
+core-y += arch/arm/net/
core-y += $(machdirs) $(platdirs)

drivers-$(CONFIG_OPROFILE) += arch/arm/oprofile/
diff --git a/arch/arm/net/Makefile b/arch/arm/net/Makefile
new file mode 100644
index 0000000..c2c1084
--- /dev/null
+++ b/arch/arm/net/Makefile
@@ -0,0 +1,3 @@
+# ARM-specific networking code
+
+obj-$(CONFIG_BPF_JIT) += bpf_jit_32.o
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
new file mode 100644
index 0000000..21a9581
--- /dev/null
+++ b/arch/arm/net/bpf_jit_32.c
@@ -0,0 +1,912 @@
+/*
+ * Just-In-Time compiler for BPF filters on 32bit ARM
+ *
+ * Copyright (c) 2011 Mircea Gherzan <***@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/filter.h>
+#include <linux/moduleloader.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <asm/cacheflush.h>
+#include <asm/hwcap.h>
+
+#include "bpf_jit_32.h"
+
+/*
+ * ABI:
+ *
+ * r0 scratch register
+ * r4 BPF register A
+ * r5 BPF register X
+ * r6 pointer to the skb
+ * r7 skb->data
+ * r8 skb_headlen(skb)
+ */
+
+#define r_scratch ARM_R0
+/* r1-r3 are (also) used for the unaligned loads on the non-ARMv7 slowpath */
+#define r_off ARM_R1
+#define r_A ARM_R4
+#define r_X ARM_R5
+#define r_skb ARM_R6
+#define r_skb_data ARM_R7
+#define r_skb_hl ARM_R8
+
+#define SCRATCH_SP_OFFSET 0
+#define SCRATCH_OFF(k) (SCRATCH_SP_OFFSET + (k))
+
+#define SEEN_MEM ((1 << BPF_MEMWORDS) - 1)
+#define SEEN_MEM_WORD(k) (1 << (k))
+#define SEEN_X (1 << BPF_MEMWORDS)
+#define SEEN_CALL (1 << (BPF_MEMWORDS + 1))
+#define SEEN_SKB (1 << (BPF_MEMWORDS + 2))
+#define SEEN_DATA (1 << (BPF_MEMWORDS + 3))
+
+#define FLAG_NEED_X_RESET (1 << 0)
+
+struct jit_ctx {
+ const struct sk_filter *skf;
+ unsigned idx;
+ unsigned prologue_bytes;
+ int ret0_fp_idx;
+ u32 seen;
+ u32 flags;
+ u32 *offsets;
+ u32 *target;
+#if __LINUX_ARM_ARCH__ < 7
+ u16 epilogue_bytes;
+ u16 imm_count;
+ u32 *imms;
+#endif
+};
+
+int bpf_jit_enable __read_mostly;
+
+static u64 jit_get_skb_b(struct sk_buff *skb, unsigned offset)
+{
+ u8 ret;
+ int err;
+
+ err = skb_copy_bits(skb, offset, &ret, 1);
+
+ return (u64)err << 32 | ret;
+}
+
+static u64 jit_get_skb_h(struct sk_buff *skb, unsigned offset)
+{
+ u16 ret;
+ int err;
+
+ err = skb_copy_bits(skb, offset, &ret, 2);
+
+ return (u64)err << 32 | ntohs(ret);
+}
+
+static u64 jit_get_skb_w(struct sk_buff *skb, unsigned offset)
+{
+ u32 ret;
+ int err;
+
+ err = skb_copy_bits(skb, offset, &ret, 4);
+
+ return (u64)err << 32 | ntohl(ret);
+}
+
+/*
+ * Wrapper that handles both OABI and EABI and assures Thumb2 interworking
+ * (where the assembly routines like __aeabi_uidiv could cause problems).
+ */
+static u32 jit_udiv(u32 dividend, u32 divisor)
+{
+ return dividend / divisor;
+}
+
+static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
+{
+ if (ctx->target != NULL)
+ ctx->target[ctx->idx] = inst | (cond << 28);
+
+ ctx->idx++;
+}
+
+/*
+ * Emit an instruction that will be executed unconditionally.
+ */
+static inline void emit(u32 inst, struct jit_ctx *ctx)
+{
+ _emit(ARM_COND_AL, inst, ctx);
+}
+
+static u16 saved_regs(struct jit_ctx *ctx)
+{
+ u16 ret = 0;
+
+ if ((ctx->skf->len > 1) ||
+ (ctx->skf->insns[0].code == BPF_S_RET_A))
+ ret |= 1 << r_A;
+
+#ifdef CONFIG_FRAME_POINTER
+ ret |= (1 << ARM_FP) | (1 << ARM_IP) | (1 << ARM_LR) | (1 << ARM_PC);
+#else
+ if (ctx->seen & SEEN_CALL)
+ ret |= 1 << ARM_LR;
+#endif
+ if (ctx->seen & (SEEN_DATA | SEEN_SKB))
+ ret |= 1 << r_skb;
+ if (ctx->seen & SEEN_DATA)
+ ret |= (1 << r_skb_data) | (1 << r_skb_hl);
+ if (ctx->seen & SEEN_X)
+ ret |= 1 << r_X;
+
+ return ret;
+}
+
+static inline int mem_words_used(struct jit_ctx *ctx)
+{
+ /* yes, we do waste some stack space IF there are "holes" in the set" */
+ return fls(ctx->seen & SEEN_MEM);
+}
+
+static inline bool is_load_to_a(u16 inst)
+{
+ switch (inst) {
+ case BPF_S_LD_W_LEN:
+ case BPF_S_LD_W_ABS:
+ case BPF_S_LD_H_ABS:
+ case BPF_S_LD_B_ABS:
+ case BPF_S_ANC_CPU:
+ case BPF_S_ANC_IFINDEX:
+ case BPF_S_ANC_MARK:
+ case BPF_S_ANC_PROTOCOL:
+ case BPF_S_ANC_RXHASH:
+ case BPF_S_ANC_QUEUE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void build_prologue(struct jit_ctx *ctx)
+{
+ u16 reg_set = saved_regs(ctx);
+ u16 first_inst = ctx->skf->insns[0].code;
+ u16 off;
+
+#ifdef CONFIG_FRAME_POINTER
+ emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx);
+ emit(ARM_PUSH(reg_set), ctx);
+ emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx);
+#else
+ if (reg_set)
+ emit(ARM_PUSH(reg_set), ctx);
+#endif
+
+ if (ctx->seen & (SEEN_DATA | SEEN_SKB))
+ emit(ARM_MOV_R(r_skb, ARM_R0), ctx);
+
+ if (ctx->seen & SEEN_DATA) {
+ off = offsetof(struct sk_buff, data);
+ emit(ARM_LDR_I(r_skb_data, r_skb, off), ctx);
+ /* headlen = len - data_len */
+ off = offsetof(struct sk_buff, len);
+ emit(ARM_LDR_I(r_skb_hl, r_skb, off), ctx);
+ off = offsetof(struct sk_buff, data_len);
+ emit(ARM_LDR_I(r_scratch, r_skb, off), ctx);
+ emit(ARM_SUB_R(r_skb_hl, r_skb_hl, r_scratch), ctx);
+ }
+
+ if (ctx->flags & FLAG_NEED_X_RESET)
+ emit(ARM_MOV_I(r_X, 0), ctx);
+
+ /* do not leak kernel data to userspace */
+ if ((first_inst != BPF_S_RET_K) && !(is_load_to_a(first_inst)))
+ emit(ARM_MOV_I(r_A, 0), ctx);
+
+ /* stack space for the BPF_MEM words */
+ if (ctx->seen & SEEN_MEM)
+ emit(ARM_SUB_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx);
+}
+
+static void build_epilogue(struct jit_ctx *ctx)
+{
+ u16 reg_set = saved_regs(ctx);
+
+ if (ctx->seen & SEEN_MEM)
+ emit(ARM_ADD_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx);
+
+ reg_set &= ~(1 << ARM_LR);
+
+#ifdef CONFIG_FRAME_POINTER
+ /* the first instruction of the prologue was: mov ip, sp */
+ reg_set &= ~(1 << ARM_IP);
+ reg_set |= (1 << ARM_SP);
+ emit(ARM_LDM(ARM_SP, reg_set), ctx);
+#else
+ if (reg_set) {
+ if (ctx->seen & SEEN_CALL)
+ reg_set |= 1 << ARM_PC;
+ emit(ARM_POP(reg_set), ctx);
+ }
+
+ if (!(ctx->seen & SEEN_CALL))
+ emit(ARM_BX(ARM_LR), ctx);
+#endif
+}
+
+static int16_t imm8m(u32 x)
+{
+ u32 rot;
+
+ for (rot = 0; rot < 16; rot++)
+ if ((x & ~ror32(0xff, 2 * rot)) == 0)
+ return rol32(x, 2 * rot) | (rot << 8);
+
+ return -1;
+}
+
+#if __LINUX_ARM_ARCH__ < 7
+
+static u16 imm_offset(u32 k, struct jit_ctx *ctx)
+{
+ unsigned i = 0, offset;
+ u16 imm;
+
+ /* on the "fake" run we just count them (duplicates included) */
+ if (ctx->target == NULL) {
+ ctx->imm_count++;
+ return 0;
+ }
+
+ while ((i < ctx->imm_count) && ctx->imms[i]) {
+ if (ctx->imms[i] == k)
+ break;
+ i++;
+ }
+
+ if (ctx->imms[i] == 0)
+ ctx->imms[i] = k;
+
+ /* constants go just after the epilogue */
+ offset = ctx->offsets[ctx->skf->len];
+ offset += ctx->prologue_bytes;
+ offset += ctx->epilogue_bytes;
+ offset += i * 4;
+
+ ctx->target[offset / 4] = k;
+
+ /* PC in ARM mode == address of the instruction + 8 */
+ imm = offset - (8 + ctx->idx * 4);
+
+ return imm;
+}
+
+#endif /* __LINUX_ARM_ARCH__ */
+
+/*
+ * Move an immediate that's not an imm8m to a core register.
+ */
+static inline void emit_mov_i_no8m(int rd, u32 val, struct jit_ctx *ctx)
+{
+#if __LINUX_ARM_ARCH__ < 7
+ emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx);
+#else
+ emit(ARM_MOVW(rd, val & 0xffff), ctx);
+ if (val > 0xffff)
+ emit(ARM_MOVT(rd, val >> 16), ctx);
+#endif
+}
+
+static inline void emit_mov_i(int rd, u32 val, struct jit_ctx *ctx)
+{
+ int imm12 = imm8m(val);
+
+ if (imm12 >= 0)
+ emit(ARM_MOV_I(rd, imm12), ctx);
+ else
+ emit_mov_i_no8m(rd, val, ctx);
+}
+
+#if __LINUX_ARM_ARCH__ < 6
+
+static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+ _emit(cond, ARM_LDRB_I(ARM_R3, r_addr, 1), ctx);
+ _emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx);
+ _emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 3), ctx);
+ _emit(cond, ARM_LSL_I(ARM_R3, ARM_R3, 16), ctx);
+ _emit(cond, ARM_LDRB_I(ARM_R0, r_addr, 2), ctx);
+ _emit(cond, ARM_ORR_S(ARM_R3, ARM_R3, ARM_R1, SRTYPE_LSL, 24), ctx);
+ _emit(cond, ARM_ORR_R(ARM_R3, ARM_R3, ARM_R2), ctx);
+ _emit(cond, ARM_ORR_S(r_res, ARM_R3, ARM_R0, SRTYPE_LSL, 8), ctx);
+}
+
+static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+ _emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx);
+ _emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 1), ctx);
+ _emit(cond, ARM_ORR_S(r_res, ARM_R2, ARM_R1, SRTYPE_LSL, 8), ctx);
+}
+
+static inline void emit_swap16(u8 r_dst, u8 r_src, struct jit_ctx *ctx)
+{
+ emit(ARM_LSL_R(ARM_R1, r_src, 8), ctx);
+ emit(ARM_ORR_S(r_dst, ARM_R1, r_src, SRTYPE_LSL, 8), ctx);
+ emit(ARM_LSL_I(r_dst, r_dst, 8), ctx);
+ emit(ARM_LSL_R(r_dst, r_dst, 8), ctx);
+}
+
+#else /* ARMv6+ */
+
+static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+ _emit(cond, ARM_LDR_I(r_res, r_addr, 0), ctx);
+#ifdef __LITTLE_ENDIAN
+ _emit(cond, ARM_REV(r_res, r_res), ctx);
+#endif
+}
+
+static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+ _emit(cond, ARM_LDRH_I(r_res, r_addr, 0), ctx);
+#ifdef __LITTLE_ENDIAN
+ _emit(cond, ARM_REV16(r_res, r_res), ctx);
+#endif
+}
+
+static inline void emit_swap16(u8 r_dst __maybe_unused,
+ u8 r_src __maybe_unused,
+ struct jit_ctx *ctx __maybe_unused)
+{
+#ifdef __LITTLE_ENDIAN
+ emit(ARM_REV16(r_dst, r_src), ctx);
+#endif
+}
+
+#endif /* __LINUX_ARM_ARCH__ < 6 */
+
+
+/* Compute the immediate value for a PC-relative branch. */
+static inline u32 b_imm(unsigned tgt, struct jit_ctx *ctx)
+{
+ u32 imm;
+
+ if (ctx->target == NULL)
+ return 0;
+ /*
+ * BPF allows only forward jumps and the offset of the target is
+ * still the one computed during the first pass.
+ */
+ imm = ctx->offsets[tgt] + ctx->prologue_bytes - (ctx->idx * 4 + 8);
+
+ return imm >> 2;
+}
+
+#define OP_IMM3(op, r1, r2, imm_val, ctx) \
+ do { \
+ imm12 = imm8m(imm_val); \
+ if (imm12 < 0) { \
+ emit_mov_i_no8m(r_scratch, imm_val, ctx); \
+ emit(op ## _R((r1), (r2), r_scratch), ctx); \
+ } else { \
+ emit(op ## _I((r1), (r2), imm12), ctx); \
+ } \
+ } while (0)
+
+static inline void emit_err_ret(u8 cond, struct jit_ctx *ctx)
+{
+ if (ctx->ret0_fp_idx >= 0) {
+ _emit(cond, ARM_B(b_imm(ctx->ret0_fp_idx, ctx)), ctx);
+ /* NOP to keep the size constant between passes */
+ emit(ARM_MOV_R(ARM_R0, ARM_R0), ctx);
+ } else {
+ _emit(cond, ARM_MOV_I(ARM_R0, 0), ctx);
+ _emit(cond, ARM_B(b_imm(ctx->skf->len, ctx)), ctx);
+ }
+}
+
+static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx)
+{
+#if __LINUX_ARM_ARCH__ < 5
+ emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx);
+
+ if (elf_hwcap & HWCAP_THUMB)
+ emit(ARM_BX(tgt_reg), ctx);
+ else
+ emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx);
+#else
+ emit(ARM_BLX_R(tgt_reg), ctx);
+#endif
+}
+
+static inline void emit_udiv(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx)
+{
+#if __LINUX_ARM_ARCH__ == 7
+ if (elf_hwcap & HWCAP_IDIVA) {
+ emit(ARM_UDIV(rd, rm, rn), ctx);
+ return;
+ }
+#endif
+ if (rm != ARM_R0)
+ emit(ARM_MOV_R(ARM_R0, rm), ctx);
+ if (rn != ARM_R1)
+ emit(ARM_MOV_R(ARM_R1, rn), ctx);
+
+ ctx->seen |= SEEN_CALL;
+ emit_mov_i(ARM_R3, (u32)jit_udiv, ctx);
+ emit_blx_r(ARM_R3, ctx);
+
+ if (rd != ARM_R0)
+ emit(ARM_MOV_R(rd, ARM_R0), ctx);
+}
+
+static inline void update_on_xread(struct jit_ctx *ctx)
+{
+ if (!(ctx->seen & SEEN_X))
+ ctx->flags |= FLAG_NEED_X_RESET;
+
+ ctx->seen |= SEEN_X;
+}
+
+static int build_body(struct jit_ctx *ctx)
+{
+ void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w};
+ const struct sk_filter *prog = ctx->skf;
+ const struct sock_filter *inst;
+ unsigned i, load_order, off, condt;
+ int imm12;
+ u32 k;
+
+ for (i = 0; i < prog->len; i++) {
+ inst = &(prog->insns[i]);
+ /* K as an immediate value operand */
+ k = inst->k;
+
+ /* compute offsets only in the fake pass */
+ if (ctx->target == NULL)
+ ctx->offsets[i] = ctx->idx * 4;
+
+ switch (inst->code) {
+ case BPF_S_LD_IMM:
+ emit_mov_i(r_A, k, ctx);
+ break;
+ case BPF_S_LD_W_LEN:
+ ctx->seen |= SEEN_SKB;
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+ emit(ARM_LDR_I(r_A, r_skb,
+ offsetof(struct sk_buff, len)), ctx);
+ break;
+ case BPF_S_LD_MEM:
+ /* A = scratch[k] */
+ ctx->seen |= SEEN_MEM_WORD(k);
+ emit(ARM_LDR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+ break;
+ case BPF_S_LD_W_ABS:
+ load_order = 2;
+ goto load;
+ case BPF_S_LD_H_ABS:
+ load_order = 1;
+ goto load;
+ case BPF_S_LD_B_ABS:
+ load_order = 0;
+load:
+ /* the interpreter will deal with the negative K */
+ if (k < 0)
+ return -1;
+ emit_mov_i(r_off, k, ctx);
+load_common:
+ ctx->seen |= SEEN_DATA | SEEN_CALL;
+
+ if (load_order > 0) {
+ emit(ARM_SUB_I(r_scratch, r_skb_hl,
+ 1 << load_order), ctx);
+ emit(ARM_CMP_R(r_scratch, r_off), ctx);
+ condt = ARM_COND_HS;
+ } else {
+ emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+ condt = ARM_COND_HI;
+ }
+
+ _emit(condt, ARM_ADD_R(r_scratch, r_off, r_skb_data),
+ ctx);
+
+ if (load_order == 0)
+ _emit(condt, ARM_LDRB_I(r_A, r_scratch, 0),
+ ctx);
+ else if (load_order == 1)
+ emit_load_be16(condt, r_A, r_scratch, ctx);
+ else if (load_order == 2)
+ emit_load_be32(condt, r_A, r_scratch, ctx);
+
+ _emit(condt, ARM_B(b_imm(i + 1, ctx)), ctx);
+
+ /* the slowpath */
+ emit_mov_i(ARM_R3, (u32)load_func[load_order], ctx);
+ emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+ /* the offset is already in R1 */
+ emit_blx_r(ARM_R3, ctx);
+ /* check the result of skb_copy_bits */
+ emit(ARM_CMP_I(ARM_R1, 0), ctx);
+ emit_err_ret(ARM_COND_NE, ctx);
+ emit(ARM_MOV_R(r_A, ARM_R0), ctx);
+ break;
+ case BPF_S_LD_W_IND:
+ load_order = 2;
+ goto load_ind;
+ case BPF_S_LD_H_IND:
+ load_order = 1;
+ goto load_ind;
+ case BPF_S_LD_B_IND:
+ load_order = 0;
+load_ind:
+ OP_IMM3(ARM_ADD, r_off, r_X, k, ctx);
+ goto load_common;
+ case BPF_S_LDX_IMM:
+ ctx->seen |= SEEN_X;
+ emit_mov_i(r_X, k, ctx);
+ break;
+ case BPF_S_LDX_W_LEN:
+ ctx->seen |= SEEN_X | SEEN_SKB;
+ emit(ARM_LDR_I(r_X, r_skb,
+ offsetof(struct sk_buff, len)), ctx);
+ break;
+ case BPF_S_LDX_MEM:
+ ctx->seen |= SEEN_X | SEEN_MEM_WORD(k);
+ emit(ARM_LDR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
+ break;
+ case BPF_S_LDX_B_MSH:
+ /* x = ((*(frame + k)) & 0xf) << 2; */
+ ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL;
+ /* the interpreter should deal with the negative K */
+ if (k < 0)
+ return -1;
+ /* offset in r1: we might have to take the slow path */
+ emit_mov_i(r_off, k, ctx);
+ emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+
+ /* load in r0: common with the slowpath */
+ _emit(ARM_COND_HI, ARM_LDRB_R(ARM_R0, r_skb_data,
+ ARM_R1), ctx);
+ /*
+ * emit_mov_i() might generate one or two instructions,
+ * the same holds for emit_blx_r()
+ */
+ _emit(ARM_COND_HI, ARM_B(b_imm(i + 1, ctx) - 2), ctx);
+
+ emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+ /* r_off is r1 */
+ emit_mov_i(ARM_R3, (u32)jit_get_skb_b, ctx);
+ emit_blx_r(ARM_R3, ctx);
+
+ emit(ARM_AND_I(r_X, ARM_R0, 0x00f), ctx);
+ emit(ARM_LSL_I(r_X, r_X, 2), ctx);
+ break;
+ case BPF_S_ST:
+ ctx->seen |= SEEN_MEM_WORD(k);
+ emit(ARM_STR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+ break;
+ case BPF_S_STX:
+ update_on_xread(ctx);
+ ctx->seen |= SEEN_MEM_WORD(k);
+ emit(ARM_STR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
+ break;
+ case BPF_S_ALU_ADD_K:
+ /* A += K */
+ OP_IMM3(ARM_ADD, r_A, r_A, k, ctx);
+ break;
+ case BPF_S_ALU_ADD_X:
+ update_on_xread(ctx);
+ emit(ARM_ADD_R(r_A, r_A, r_X), ctx);
+ break;
+ case BPF_S_ALU_SUB_K:
+ /* A -= K */
+ OP_IMM3(ARM_SUB, r_A, r_A, k, ctx);
+ break;
+ case BPF_S_ALU_SUB_X:
+ update_on_xread(ctx);
+ emit(ARM_SUB_R(r_A, r_A, r_X), ctx);
+ break;
+ case BPF_S_ALU_MUL_K:
+ /* A *= K */
+ emit_mov_i(r_scratch, k, ctx);
+ emit(ARM_MUL(r_A, r_A, r_scratch), ctx);
+ break;
+ case BPF_S_ALU_MUL_X:
+ update_on_xread(ctx);
+ emit(ARM_MUL(r_A, r_A, r_X), ctx);
+ break;
+ case BPF_S_ALU_DIV_K:
+ /* current k == reciprocal_value(userspace k) */
+ emit_mov_i(r_scratch, k, ctx);
+ /* A = top 32 bits of the product */
+ emit(ARM_UMULL(r_scratch, r_A, r_A, r_scratch), ctx);
+ break;
+ case BPF_S_ALU_DIV_X:
+ update_on_xread(ctx);
+ emit(ARM_CMP_I(r_X, 0), ctx);
+ emit_err_ret(ARM_COND_EQ, ctx);
+ emit_udiv(r_A, r_A, r_X, ctx);
+ break;
+ case BPF_S_ALU_OR_K:
+ /* A |= K */
+ OP_IMM3(ARM_ORR, r_A, r_A, k, ctx);
+ break;
+ case BPF_S_ALU_OR_X:
+ update_on_xread(ctx);
+ emit(ARM_ORR_R(r_A, r_A, r_X), ctx);
+ break;
+ case BPF_S_ALU_AND_K:
+ /* A &= K */
+ OP_IMM3(ARM_AND, r_A, r_A, k, ctx);
+ break;
+ case BPF_S_ALU_AND_X:
+ update_on_xread(ctx);
+ emit(ARM_AND_R(r_A, r_A, r_X), ctx);
+ break;
+ case BPF_S_ALU_LSH_K:
+ if (unlikely(k > 31))
+ return -1;
+ emit(ARM_LSL_I(r_A, r_A, k), ctx);
+ break;
+ case BPF_S_ALU_LSH_X:
+ update_on_xread(ctx);
+ emit(ARM_LSL_R(r_A, r_A, r_X), ctx);
+ break;
+ case BPF_S_ALU_RSH_K:
+ if (unlikely(k > 31))
+ return -1;
+ emit(ARM_LSR_I(r_A, r_A, k), ctx);
+ break;
+ case BPF_S_ALU_RSH_X:
+ update_on_xread(ctx);
+ emit(ARM_LSR_R(r_A, r_A, r_X), ctx);
+ break;
+ case BPF_S_ALU_NEG:
+ /* A = -A */
+ emit(ARM_RSB_I(r_A, r_A, 0), ctx);
+ break;
+ case BPF_S_JMP_JA:
+ /* pc += K */
+ emit(ARM_B(b_imm(i + k + 1, ctx)), ctx);
+ break;
+ case BPF_S_JMP_JEQ_K:
+ /* pc += (A == K) ? pc->jt : pc->jf */
+ condt = ARM_COND_EQ;
+ goto cmp_imm;
+ case BPF_S_JMP_JGT_K:
+ /* pc += (A > K) ? pc->jt : pc->jf */
+ condt = ARM_COND_HI;
+ goto cmp_imm;
+ case BPF_S_JMP_JGE_K:
+ /* pc += (A >= K) ? pc->jt : pc->jf */
+ condt = ARM_COND_HS;
+cmp_imm:
+ imm12 = imm8m(k);
+ if (imm12 < 0) {
+ emit_mov_i_no8m(r_scratch, k, ctx);
+ emit(ARM_CMP_R(r_A, r_scratch), ctx);
+ } else {
+ emit(ARM_CMP_I(r_A, imm12), ctx);
+ }
+cond_jump:
+ if (inst->jt)
+ _emit(condt, ARM_B(b_imm(i + inst->jt + 1,
+ ctx)), ctx);
+ if (inst->jf)
+ _emit(condt ^ 1, ARM_B(b_imm(i + inst->jf + 1,
+ ctx)), ctx);
+ break;
+ case BPF_S_JMP_JEQ_X:
+ /* pc += (A == X) ? pc->jt : pc->jf */
+ condt = ARM_COND_EQ;
+ goto cmp_x;
+ case BPF_S_JMP_JGT_X:
+ /* pc += (A > X) ? pc->jt : pc->jf */
+ condt = ARM_COND_HI;
+ goto cmp_x;
+ case BPF_S_JMP_JGE_X:
+ /* pc += (A >= X) ? pc->jt : pc->jf */
+ condt = ARM_COND_CS;
+cmp_x:
+ update_on_xread(ctx);
+ emit(ARM_CMP_R(r_A, r_X), ctx);
+ goto cond_jump;
+ case BPF_S_JMP_JSET_K:
+ /* pc += (A & K) ? pc->jt : pc->jf */
+ condt = ARM_COND_NE;
+ /* not set iff all zeroes iff Z==1 iff EQ */
+
+ imm12 = imm8m(k);
+ if (imm12 < 0) {
+ emit_mov_i_no8m(r_scratch, k, ctx);
+ emit(ARM_TST_R(r_A, r_scratch), ctx);
+ } else {
+ emit(ARM_TST_I(r_A, imm12), ctx);
+ }
+ goto cond_jump;
+ case BPF_S_JMP_JSET_X:
+ /* pc += (A & X) ? pc->jt : pc->jf */
+ update_on_xread(ctx);
+ condt = ARM_COND_NE;
+ emit(ARM_TST_R(r_A, r_X), ctx);
+ goto cond_jump;
+ case BPF_S_RET_A:
+ emit(ARM_MOV_R(ARM_R0, r_A), ctx);
+ goto b_epilogue;
+ case BPF_S_RET_K:
+ if ((k == 0) && (ctx->ret0_fp_idx < 0))
+ ctx->ret0_fp_idx = i;
+ emit_mov_i(ARM_R0, k, ctx);
+b_epilogue:
+ if (i != ctx->skf->len - 1)
+ emit(ARM_B(b_imm(prog->len, ctx)), ctx);
+ break;
+ case BPF_S_MISC_TAX:
+ /* X = A */
+ ctx->seen |= SEEN_X;
+ emit(ARM_MOV_R(r_X, r_A), ctx);
+ break;
+ case BPF_S_MISC_TXA:
+ /* A = X */
+ update_on_xread(ctx);
+ emit(ARM_MOV_R(r_A, r_X), ctx);
+ break;
+ case BPF_S_ANC_PROTOCOL:
+ /* A = ntohs(skb->protocol) */
+ ctx->seen |= SEEN_SKB;
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff,
+ protocol) != 2);
+ off = offsetof(struct sk_buff, protocol);
+ emit(ARM_LDRH_I(r_scratch, r_skb, off), ctx);
+ emit_swap16(r_A, r_scratch, ctx);
+ break;
+ case BPF_S_ANC_CPU:
+ /* r_scratch = current_thread_info() */
+ OP_IMM3(ARM_BIC, r_scratch, ARM_SP, THREAD_SIZE - 1, ctx);
+ /* A = current_thread_info()->cpu */
+ BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4);
+ off = offsetof(struct thread_info, cpu);
+ emit(ARM_LDR_I(r_A, r_scratch, off), ctx);
+ break;
+ case BPF_S_ANC_IFINDEX:
+ /* A = skb->dev->ifindex */
+ ctx->seen |= SEEN_SKB;
+ off = offsetof(struct sk_buff, dev);
+ emit(ARM_LDR_I(r_scratch, r_skb, off), ctx);
+
+ emit(ARM_CMP_I(r_scratch, 0), ctx);
+ emit_err_ret(ARM_COND_EQ, ctx);
+
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device,
+ ifindex) != 4);
+ off = offsetof(struct net_device, ifindex);
+ emit(ARM_LDR_I(r_A, r_scratch, off), ctx);
+ break;
+ case BPF_S_ANC_MARK:
+ ctx->seen |= SEEN_SKB;
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+ off = offsetof(struct sk_buff, mark);
+ emit(ARM_LDR_I(r_A, r_skb, off), ctx);
+ break;
+ case BPF_S_ANC_RXHASH:
+ ctx->seen |= SEEN_SKB;
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4);
+ off = offsetof(struct sk_buff, rxhash);
+ emit(ARM_LDR_I(r_A, r_skb, off), ctx);
+ break;
+ case BPF_S_ANC_QUEUE:
+ ctx->seen |= SEEN_SKB;
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff,
+ queue_mapping) != 2);
+ BUILD_BUG_ON(offsetof(struct sk_buff,
+ queue_mapping) > 0xff);
+ off = offsetof(struct sk_buff, queue_mapping);
+ emit(ARM_LDRH_I(r_A, r_skb, off), ctx);
+ break;
+ default:
+ return -1;
+ }
+ }
+
+ /* compute offsets only during the first pass */
+ if (ctx->target == NULL)
+ ctx->offsets[i] = ctx->idx * 4;
+
+ return 0;
+}
+
+
+void bpf_jit_compile(struct sk_filter *fp)
+{
+ struct jit_ctx ctx;
+ unsigned tmp_idx;
+ unsigned alloc_size;
+
+ if (!bpf_jit_enable)
+ return;
+
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.skf = fp;
+ ctx.ret0_fp_idx = -1;
+
+ ctx.offsets = kzalloc(GFP_KERNEL, 4 * (ctx.skf->len + 1));
+ if (ctx.offsets == NULL)
+ return;
+
+ /* fake pass to fill in the ctx->seen */
+ if (unlikely(build_body(&ctx)))
+ goto out;
+
+ tmp_idx = ctx.idx;
+ build_prologue(&ctx);
+ ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4;
+
+#if __LINUX_ARM_ARCH__ < 7
+ tmp_idx = ctx.idx;
+ build_epilogue(&ctx);
+ ctx.epilogue_bytes = (ctx.idx - tmp_idx) * 4;
+
+ ctx.idx += ctx.imm_count;
+ if (ctx.imm_count) {
+ ctx.imms = kzalloc(GFP_KERNEL, 4 * ctx.imm_count);
+ if (ctx.imms == NULL)
+ goto out;
+ }
+#else
+ /* there's nothing after the epilogue on ARMv7 */
+ build_epilogue(&ctx);
+#endif
+
+ alloc_size = 4 * ctx.idx;
+ ctx.target = module_alloc(max(sizeof(struct work_struct),
+ alloc_size));
+ if (unlikely(ctx.target == NULL))
+ goto out;
+
+ ctx.idx = 0;
+ build_prologue(&ctx);
+ build_body(&ctx);
+ build_epilogue(&ctx);
+
+ flush_icache_range((u32)ctx.target, (u32)(ctx.target + ctx.idx));
+
+#if __LINUX_ARM_ARCH__ < 7
+ if (ctx.imm_count)
+ kfree(ctx.imms);
+#endif
+
+ if (bpf_jit_enable > 1)
+ print_hex_dump(KERN_INFO, "BPF JIT code: ",
+ DUMP_PREFIX_ADDRESS, 16, 4, ctx.target,
+ alloc_size, false);
+
+ fp->bpf_func = (void *)ctx.target;
+out:
+ kfree(ctx.offsets);
+ return;
+}
+
+static void bpf_jit_free_worker(struct work_struct *work)
+{
+ module_free(NULL, work);
+}
+
+void bpf_jit_free(struct sk_filter *fp)
+{
+ struct work_struct *work;
+
+ if (fp->bpf_func != sk_run_filter) {
+ work = (struct work_struct *)fp->bpf_func;
+
+ INIT_WORK(work, bpf_jit_free_worker);
+ schedule_work(work);
+ }
+}
+
diff --git a/arch/arm/net/bpf_jit_32.h b/arch/arm/net/bpf_jit_32.h
new file mode 100644
index 0000000..99ae5e3
--- /dev/null
+++ b/arch/arm/net/bpf_jit_32.h
@@ -0,0 +1,190 @@
+/*
+ * Just-In-Time compiler for BPF filters on 32bit ARM
+ *
+ * Copyright (c) 2011 Mircea Gherzan <***@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#ifndef PFILTER_OPCODES_ARM_H
+#define PFILTER_OPCODES_ARM_H
+
+#define ARM_R0 0
+#define ARM_R1 1
+#define ARM_R2 2
+#define ARM_R3 3
+#define ARM_R4 4
+#define ARM_R5 5
+#define ARM_R6 6
+#define ARM_R7 7
+#define ARM_R8 8
+#define ARM_R9 9
+#define ARM_R10 10
+#define ARM_FP 11
+#define ARM_IP 12
+#define ARM_SP 13
+#define ARM_LR 14
+#define ARM_PC 15
+
+#define ARM_COND_EQ 0x0
+#define ARM_COND_NE 0x1
+#define ARM_COND_CS 0x2
+#define ARM_COND_HS ARM_COND_CS
+#define ARM_COND_CC 0x3
+#define ARM_COND_LO ARM_COND_CC
+#define ARM_COND_MI 0x4
+#define ARM_COND_PL 0x5
+#define ARM_COND_VS 0x6
+#define ARM_COND_VC 0x7
+#define ARM_COND_HI 0x8
+#define ARM_COND_LS 0x9
+#define ARM_COND_GE 0xa
+#define ARM_COND_LT 0xb
+#define ARM_COND_GT 0xc
+#define ARM_COND_LE 0xd
+#define ARM_COND_AL 0xe
+
+/* register shift types */
+#define SRTYPE_LSL 0
+#define SRTYPE_LSR 1
+#define SRTYPE_ASR 2
+#define SRTYPE_ROR 3
+
+#define ARM_INST_ADD_R 0x00800000
+#define ARM_INST_ADD_I 0x02800000
+
+#define ARM_INST_AND_R 0x00000000
+#define ARM_INST_AND_I 0x02000000
+
+#define ARM_INST_BIC_R 0x01c00000
+#define ARM_INST_BIC_I 0x03c00000
+
+#define ARM_INST_B 0x0a000000
+#define ARM_INST_BX 0x012FFF10
+#define ARM_INST_BLX_R 0x012fff30
+
+#define ARM_INST_CMP_R 0x01500000
+#define ARM_INST_CMP_I 0x03500000
+
+#define ARM_INST_LDRB_I 0x05d00000
+#define ARM_INST_LDRB_R 0x07d00000
+#define ARM_INST_LDRH_I 0x01d000b0
+#define ARM_INST_LDR_I 0x05900000
+
+#define ARM_INST_LDM 0x08900000
+
+#define ARM_INST_LSL_I 0x01a00000
+#define ARM_INST_LSL_R 0x01a00010
+
+#define ARM_INST_LSR_I 0x01a00020
+#define ARM_INST_LSR_R 0x01a00030
+
+#define ARM_INST_MOV_R 0x01a00000
+#define ARM_INST_MOV_I 0x03a00000
+#define ARM_INST_MOVW 0x03000000
+#define ARM_INST_MOVT 0x03400000
+
+#define ARM_INST_MUL 0x00000090
+
+#define ARM_INST_POP 0x08bd0000
+#define ARM_INST_PUSH 0x092d0000
+
+#define ARM_INST_ORR_R 0x01800000
+#define ARM_INST_ORR_I 0x03800000
+
+#define ARM_INST_REV 0x06bf0f30
+#define ARM_INST_REV16 0x06bf0fb0
+
+#define ARM_INST_RSB_I 0x02600000
+
+#define ARM_INST_SUB_R 0x00400000
+#define ARM_INST_SUB_I 0x02400000
+
+#define ARM_INST_STR_I 0x05800000
+
+#define ARM_INST_TST_R 0x01100000
+#define ARM_INST_TST_I 0x03100000
+
+#define ARM_INST_UDIV 0x0730f010
+
+#define ARM_INST_UMULL 0x00800090
+
+/* register */
+#define _AL3_R(op, rd, rn, rm) ((op ## _R) | (rd) << 12 | (rn) << 16 | (rm))
+/* immediate */
+#define _AL3_I(op, rd, rn, imm) ((op ## _I) | (rd) << 12 | (rn) << 16 | (imm))
+
+#define ARM_ADD_R(rd, rn, rm) _AL3_R(ARM_INST_ADD, rd, rn, rm)
+#define ARM_ADD_I(rd, rn, imm) _AL3_I(ARM_INST_ADD, rd, rn, imm)
+
+#define ARM_AND_R(rd, rn, rm) _AL3_R(ARM_INST_AND, rd, rn, rm)
+#define ARM_AND_I(rd, rn, imm) _AL3_I(ARM_INST_AND, rd, rn, imm)
+
+#define ARM_BIC_R(rd, rn, rm) _AL3_R(ARM_INST_BIC, rd, rn, rm)
+#define ARM_BIC_I(rd, rn, imm) _AL3_I(ARM_INST_BIC, rd, rn, imm)
+
+#define ARM_B(imm24) (ARM_INST_B | ((imm24) & 0xffffff))
+#define ARM_BX(rm) (ARM_INST_BX | (rm))
+#define ARM_BLX_R(rm) (ARM_INST_BLX_R | (rm))
+
+#define ARM_CMP_R(rn, rm) _AL3_R(ARM_INST_CMP, 0, rn, rm)
+#define ARM_CMP_I(rn, imm) _AL3_I(ARM_INST_CMP, 0, rn, imm)
+
+#define ARM_LDR_I(rt, rn, off) (ARM_INST_LDR_I | (rt) << 12 | (rn) << 16 \
+ | (off))
+#define ARM_LDRB_I(rt, rn, off) (ARM_INST_LDRB_I | (rt) << 12 | (rn) << 16 \
+ | (off))
+#define ARM_LDRB_R(rt, rn, rm) (ARM_INST_LDRB_R | (rt) << 12 | (rn) << 16 \
+ | (rm))
+#define ARM_LDRH_I(rt, rn, off) (ARM_INST_LDRH_I | (rt) << 12 | (rn) << 16 \
+ | (((off) & 0xf0) << 4) | ((off) & 0xf))
+
+#define ARM_LDM(rn, regs) (ARM_INST_LDM | (rn) << 16 | (regs))
+
+#define ARM_LSL_R(rd, rn, rm) (_AL3_R(ARM_INST_LSL, rd, 0, rn) | (rm) << 8)
+#define ARM_LSL_I(rd, rn, imm) (_AL3_I(ARM_INST_LSL, rd, 0, rn) | (imm) << 7)
+
+#define ARM_LSR_R(rd, rn, rm) (_AL3_R(ARM_INST_LSR, rd, 0, rn) | (rm) << 8)
+#define ARM_LSR_I(rd, rn, imm) (_AL3_I(ARM_INST_LSR, rd, 0, rn) | (imm) << 7)
+
+#define ARM_MOV_R(rd, rm) _AL3_R(ARM_INST_MOV, rd, 0, rm)
+#define ARM_MOV_I(rd, imm) _AL3_I(ARM_INST_MOV, rd, 0, imm)
+
+#define ARM_MOVW(rd, imm) \
+ (ARM_INST_MOVW | ((imm) >> 12) << 16 | (rd) << 12 | ((imm) & 0x0fff))
+
+#define ARM_MOVT(rd, imm) \
+ (ARM_INST_MOVT | ((imm) >> 12) << 16 | (rd) << 12 | ((imm) & 0x0fff))
+
+#define ARM_MUL(rd, rm, rn) (ARM_INST_MUL | (rd) << 16 | (rm) << 8 | (rn))
+
+#define ARM_POP(regs) (ARM_INST_POP | (regs))
+#define ARM_PUSH(regs) (ARM_INST_PUSH | (regs))
+
+#define ARM_ORR_R(rd, rn, rm) _AL3_R(ARM_INST_ORR, rd, rn, rm)
+#define ARM_ORR_I(rd, rn, imm) _AL3_I(ARM_INST_ORR, rd, rn, imm)
+#define ARM_ORR_S(rd, rn, rm, type, rs) \
+ (ARM_ORR_R(rd, rn, rm) | (type) << 5 | (rs) << 7)
+
+#define ARM_REV(rd, rm) (ARM_INST_REV | (rd) << 12 | (rm))
+#define ARM_REV16(rd, rm) (ARM_INST_REV16 | (rd) << 12 | (rm))
+
+#define ARM_RSB_I(rd, rn, imm) _AL3_I(ARM_INST_RSB, rd, rn, imm)
+
+#define ARM_SUB_R(rd, rn, rm) _AL3_R(ARM_INST_SUB, rd, rn, rm)
+#define ARM_SUB_I(rd, rn, imm) _AL3_I(ARM_INST_SUB, rd, rn, imm)
+
+#define ARM_STR_I(rt, rn, off) (ARM_INST_STR_I | (rt) << 12 | (rn) << 16 \
+ | (off))
+
+#define ARM_TST_R(rn, rm) _AL3_R(ARM_INST_TST, 0, rn, rm)
+#define ARM_TST_I(rn, imm) _AL3_I(ARM_INST_TST, 0, rn, imm)
+
+#define ARM_UDIV(rd, rn, rm) (ARM_INST_UDIV | (rd) << 16 | (rn) | (rm) << 8)
+
+#define ARM_UMULL(rd_lo, rd_hi, rn, rm) (ARM_INST_UMULL | (rd_hi) << 16 \
+ | (rd_lo) << 12 | (rm) << 8 | rn)
+
+#endif /* PFILTER_OPCODES_ARM_H */
--
1.7.7.3
Eric Dumazet
2012-01-09 07:14:50 UTC
Permalink
Post by Mircea Gherzan
Based of Matt Evans's PPC64 implementation.
The compiler generates ARM instructions but interworking is
supported for Thumb2 kernels.
Supports both little and big endian. Unaligned loads are emitted
for ARMv6+. Not all the BPF opcodes that deal with ancillary data
are supported. The scratch memory of the filter lives on the stack.
Hardware integer division is used if it is available.
echo 1 > /proc/sys/net/core/bpf_jit_enable
A value greater than 1 enables opcode output.
---
* fix the intruction generation for LDX_MSH, OR_X, LSH_K,
RSH_K and JMP_JA
* fix the condition for saving the A register
* use fls() instead of the compiler builtin
* punt to the interpreter on absolute loads with K < 0
* check for invalid data references
* support the NEG opcode
* clear X in the prologue based on a context flag
* simplify the conditional jumps
* fix the code generation for the ANC_CPU opcode
* replace SEEN_LEN with SEEN_SKB
* set ctx->seen when handling some ancillary data opcodes
* first check if the JIT compiler is enabled
* fix the code generation for the LDX_MSH opcode
* no longer depend on EABI and !Thumb2
* add BLX "emulation" for ARMv4 without Thumb
* use the integer divide instruction on Cortex-A15
* fix the handling of the DIV_K opcode
* use a C wrapper for __aeabi_uidiv
* fix the generation of the epilogue (non-FP case)
* enable the compiler only for ARMv5+ because of the BLX instruction
* use the same comparison for the ARM version checks
* use misaligned accesses on ARMv6
* fix the SEEN_MEM
* fix the mem_words_used()
arch/arm/Kconfig | 1 +
arch/arm/Makefile | 1 +
arch/arm/net/Makefile | 3 +
arch/arm/net/bpf_jit_32.c | 912 +++++++++++++++++++++++++++++++++++++++++++++
arch/arm/net/bpf_jit_32.h | 190 ++++++++++
5 files changed, 1107 insertions(+), 0 deletions(-)
create mode 100644 arch/arm/net/Makefile
create mode 100644 arch/arm/net/bpf_jit_32.c
create mode 100644 arch/arm/net/bpf_jit_32.h
Acked-by: Eric Dumazet <***@gmail.com>
Mircea Gherzan
2012-01-09 08:58:34 UTC
Permalink
Post by Mircea Gherzan
Based of Matt Evans's PPC64 implementation.
The compiler generates ARM instructions but interworking is
supported for Thumb2 kernels.
Supports both little and big endian. Unaligned loads are emitted
for ARMv6+. Not all the BPF opcodes that deal with ancillary data
are supported. The scratch memory of the filter lives on the stack.
Hardware integer division is used if it is available.
echo 1 > /proc/sys/net/core/bpf_jit_enable
A value greater than 1 enables opcode output.
---
* fix the intruction generation for LDX_MSH, OR_X, LSH_K,
RSH_K and JMP_JA
* fix the condition for saving the A register
* use fls() instead of the compiler builtin
* punt to the interpreter on absolute loads with K < 0
* check for invalid data references
* support the NEG opcode
* clear X in the prologue based on a context flag
* simplify the conditional jumps
* fix the code generation for the ANC_CPU opcode
* replace SEEN_LEN with SEEN_SKB
* set ctx->seen when handling some ancillary data opcodes
* first check if the JIT compiler is enabled
* fix the code generation for the LDX_MSH opcode
* no longer depend on EABI and !Thumb2
* add BLX "emulation" for ARMv4 without Thumb
* use the integer divide instruction on Cortex-A15
* fix the handling of the DIV_K opcode
* use a C wrapper for __aeabi_uidiv
* fix the generation of the epilogue (non-FP case)
* enable the compiler only for ARMv5+ because of the BLX instruction
* use the same comparison for the ARM version checks
* use misaligned accesses on ARMv6
* fix the SEEN_MEM
* fix the mem_words_used()
arch/arm/Kconfig | 1 +
arch/arm/Makefile | 1 +
arch/arm/net/Makefile | 3 +
arch/arm/net/bpf_jit_32.c | 912 +++++++++++++++++++++++++++++++++++++++++++++
arch/arm/net/bpf_jit_32.h | 190 ++++++++++
5 files changed, 1107 insertions(+), 0 deletions(-)
create mode 100644 arch/arm/net/Makefile
create mode 100644 arch/arm/net/bpf_jit_32.c
create mode 100644 arch/arm/net/bpf_jit_32.h
Updated it in the patch tracking system as 7259/2. Since we're in the
middle of the merge window, can this one get into 3.3?

Thanks,
Mircea
Mircea Gherzan
2012-02-13 15:36:29 UTC
Permalink
Hi,
Post by Mircea Gherzan
Based of Matt Evans's PPC64 implementation.
The compiler generates ARM instructions but interworking is
supported for Thumb2 kernels.
Supports both little and big endian. Unaligned loads are emitted
for ARMv6+. Not all the BPF opcodes that deal with ancillary data
are supported. The scratch memory of the filter lives on the stack.
Hardware integer division is used if it is available.
echo 1 > /proc/sys/net/core/bpf_jit_enable
A value greater than 1 enables opcode output.
---
* fix the intruction generation for LDX_MSH, OR_X, LSH_K,
RSH_K and JMP_JA
* fix the condition for saving the A register
* use fls() instead of the compiler builtin
* punt to the interpreter on absolute loads with K < 0
* check for invalid data references
* support the NEG opcode
* clear X in the prologue based on a context flag
* simplify the conditional jumps
* fix the code generation for the ANC_CPU opcode
* replace SEEN_LEN with SEEN_SKB
* set ctx->seen when handling some ancillary data opcodes
* first check if the JIT compiler is enabled
* fix the code generation for the LDX_MSH opcode
* no longer depend on EABI and !Thumb2
* add BLX "emulation" for ARMv4 without Thumb
* use the integer divide instruction on Cortex-A15
* fix the handling of the DIV_K opcode
* use a C wrapper for __aeabi_uidiv
* fix the generation of the epilogue (non-FP case)
* enable the compiler only for ARMv5+ because of the BLX instruction
* use the same comparison for the ARM version checks
* use misaligned accesses on ARMv6
* fix the SEEN_MEM
* fix the mem_words_used()
arch/arm/Kconfig | 1 +
arch/arm/Makefile | 1 +
arch/arm/net/Makefile | 3 +
arch/arm/net/bpf_jit_32.c | 912 +++++++++++++++++++++++++++++++++++++++++++++
arch/arm/net/bpf_jit_32.h | 190 ++++++++++
5 files changed, 1107 insertions(+), 0 deletions(-)
create mode 100644 arch/arm/net/Makefile
create mode 100644 arch/arm/net/bpf_jit_32.c
create mode 100644 arch/arm/net/bpf_jit_32.h
Gentle ping. This patch has been in the tracking system for over a
month. Is there any reason not to apply it in arm/for-next?

Thanks,
Mircea
Russell King - ARM Linux
2012-02-13 16:02:49 UTC
Permalink
Post by Mircea Gherzan
Gentle ping. This patch has been in the tracking system for over a
month. Is there any reason not to apply it in arm/for-next?
Yes. It needs quite a review to make sure that there's absolutely no
possibility for userspace to be able to generate malicious ARM code
and then have it executed. Or put it another way: security paranoia.

I'm afraid that I've not been able to look at it at all yet, and I
haven't seen anyone looking at the code from that aspect.
Mircea Gherzan
2012-03-15 07:23:43 UTC
Permalink
Post by Russell King - ARM Linux
Post by Mircea Gherzan
Gentle ping. This patch has been in the tracking system for over a
month. Is there any reason not to apply it in arm/for-next?
Yes. It needs quite a review to make sure that there's absolutely no
possibility for userspace to be able to generate malicious ARM code
and then have it executed. Or put it another way: security paranoia.
Are there any specific security aspects you're thinking of? Leaks to
userspace, divisions by zero and invalid packet offsets are already
taken care of.
Post by Russell King - ARM Linux
I'm afraid that I've not been able to look at it at all yet, and I
haven't seen anyone looking at the code from that aspect.
Thanks,
Mircea
Russell King - ARM Linux
2012-03-15 11:41:51 UTC
Permalink
I'm unconvinced that this is safe.
Post by Mircea Gherzan
+static int build_body(struct jit_ctx *ctx)
+{
+ void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w};
+ const struct sk_filter *prog = ctx->skf;
+ const struct sock_filter *inst;
+ unsigned i, load_order, off, condt;
+ int imm12;
+ u32 k;
+
+ for (i = 0; i < prog->len; i++) {
+ inst = &(prog->insns[i]);
+ /* K as an immediate value operand */
+ k = inst->k;
+
+ /* compute offsets only in the fake pass */
+ if (ctx->target == NULL)
+ ctx->offsets[i] = ctx->idx * 4;
+
+ switch (inst->code) {
+ emit_mov_i(r_A, k, ctx);
+ break;
+ ctx->seen |= SEEN_SKB;
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+ emit(ARM_LDR_I(r_A, r_skb,
+ offsetof(struct sk_buff, len)), ctx);
+ break;
+ /* A = scratch[k] */
+ ctx->seen |= SEEN_MEM_WORD(k);
+ emit(ARM_LDR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+ break;
+ load_order = 2;
+ goto load;
+ load_order = 1;
+ goto load;
+ load_order = 0;
+ /* the interpreter will deal with the negative K */
+ if (k < 0)
+ return -1;
k is a u32, so won't this will always be false - negative numbers will
appear as huge positive numbers instead.
Post by Mircea Gherzan
+ emit_mov_i(r_off, k, ctx);
+ ctx->seen |= SEEN_DATA | SEEN_CALL;
+
+ if (load_order > 0) {
+ emit(ARM_SUB_I(r_scratch, r_skb_hl,
+ 1 << load_order), ctx);
+ emit(ARM_CMP_R(r_scratch, r_off), ctx);
+ condt = ARM_COND_HS;
+ } else {
+ emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+ condt = ARM_COND_HI;
+ }
+
+ _emit(condt, ARM_ADD_R(r_scratch, r_off, r_skb_data),
+ ctx);
+
+ if (load_order == 0)
+ _emit(condt, ARM_LDRB_I(r_A, r_scratch, 0),
+ ctx);
+ else if (load_order == 1)
+ emit_load_be16(condt, r_A, r_scratch, ctx);
+ else if (load_order == 2)
+ emit_load_be32(condt, r_A, r_scratch, ctx);
I think this is creating code to do something like this:

sub r_scratch, r_skb_hl, #1 << load_order
cmp r_scratch, k
ldrhs dest, [r_skb_data, k]

Isn't this going to perform the loads for any k greater than the header
length? What stops k being larger than the packet? And is this correct
in the first place?
Post by Mircea Gherzan
+
+ _emit(condt, ARM_B(b_imm(i + 1, ctx)), ctx);
+
+ /* the slowpath */
+ emit_mov_i(ARM_R3, (u32)load_func[load_order], ctx);
+ emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+ /* the offset is already in R1 */
+ emit_blx_r(ARM_R3, ctx);
+ /* check the result of skb_copy_bits */
+ emit(ARM_CMP_I(ARM_R1, 0), ctx);
+ emit_err_ret(ARM_COND_NE, ctx);
+ emit(ARM_MOV_R(r_A, ARM_R0), ctx);
+ break;
+ load_order = 2;
+ goto load_ind;
+ load_order = 1;
+ goto load_ind;
+ load_order = 0;
+ OP_IMM3(ARM_ADD, r_off, r_X, k, ctx);
+ goto load_common;
+ ctx->seen |= SEEN_X;
+ emit_mov_i(r_X, k, ctx);
+ break;
+ ctx->seen |= SEEN_X | SEEN_SKB;
+ emit(ARM_LDR_I(r_X, r_skb,
+ offsetof(struct sk_buff, len)), ctx);
+ break;
+ ctx->seen |= SEEN_X | SEEN_MEM_WORD(k);
+ emit(ARM_LDR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
What about this - SCRATCH_OFF() is just a simple:
+#define SCRATCH_OFF(k) (SCRATCH_SP_OFFSET + (k))

So what if 'k' is a carefully chosen 32-bit number outside of the range
of the stack? Shouldn't there be a check for k < BPF_MEMWORDS to fail
the translation where-ever SEEN_MEM_WORD(k) or SCRATCH_OFF(k) is used?
Post by Mircea Gherzan
+ break;
+ /* x = ((*(frame + k)) & 0xf) << 2; */
So, k is passed in from userspace. What stops k being out of the bounds
of 'frame' and referring to other kernel space data via carefully crafted
bpf code?
Post by Mircea Gherzan
+ ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL;
+ /* the interpreter should deal with the negative K */
+ if (k < 0)
+ return -1;
+ /* offset in r1: we might have to take the slow path */
+ emit_mov_i(r_off, k, ctx);
+ emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+
+ /* load in r0: common with the slowpath */
+ _emit(ARM_COND_HI, ARM_LDRB_R(ARM_R0, r_skb_data,
+ ARM_R1), ctx);
+ /*
+ * emit_mov_i() might generate one or two instructions,
+ * the same holds for emit_blx_r()
+ */
+ _emit(ARM_COND_HI, ARM_B(b_imm(i + 1, ctx) - 2), ctx);
+
+ emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+ /* r_off is r1 */
+ emit_mov_i(ARM_R3, (u32)jit_get_skb_b, ctx);
+ emit_blx_r(ARM_R3, ctx);
+
+ emit(ARM_AND_I(r_X, ARM_R0, 0x00f), ctx);
+ emit(ARM_LSL_I(r_X, r_X, 2), ctx);
+ break;
+ ctx->seen |= SEEN_MEM_WORD(k);
+ emit(ARM_STR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+ break;
+ update_on_xread(ctx);
+ ctx->seen |= SEEN_MEM_WORD(k);
+ emit(ARM_STR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
+ break;
+ /* A += K */
+ OP_IMM3(ARM_ADD, r_A, r_A, k, ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_ADD_R(r_A, r_A, r_X), ctx);
+ break;
+ /* A -= K */
+ OP_IMM3(ARM_SUB, r_A, r_A, k, ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_SUB_R(r_A, r_A, r_X), ctx);
+ break;
+ /* A *= K */
+ emit_mov_i(r_scratch, k, ctx);
+ emit(ARM_MUL(r_A, r_A, r_scratch), ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_MUL(r_A, r_A, r_X), ctx);
+ break;
+ /* current k == reciprocal_value(userspace k) */
+ emit_mov_i(r_scratch, k, ctx);
+ /* A = top 32 bits of the product */
+ emit(ARM_UMULL(r_scratch, r_A, r_A, r_scratch), ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_CMP_I(r_X, 0), ctx);
+ emit_err_ret(ARM_COND_EQ, ctx);
+ emit_udiv(r_A, r_A, r_X, ctx);
+ break;
+ /* A |= K */
+ OP_IMM3(ARM_ORR, r_A, r_A, k, ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_ORR_R(r_A, r_A, r_X), ctx);
+ break;
+ /* A &= K */
+ OP_IMM3(ARM_AND, r_A, r_A, k, ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_AND_R(r_A, r_A, r_X), ctx);
+ break;
+ if (unlikely(k > 31))
+ return -1;
+ emit(ARM_LSL_I(r_A, r_A, k), ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_LSL_R(r_A, r_A, r_X), ctx);
+ break;
+ if (unlikely(k > 31))
+ return -1;
+ emit(ARM_LSR_I(r_A, r_A, k), ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_LSR_R(r_A, r_A, r_X), ctx);
+ break;
+ /* A = -A */
+ emit(ARM_RSB_I(r_A, r_A, 0), ctx);
+ break;
+ /* pc += K */
+ emit(ARM_B(b_imm(i + k + 1, ctx)), ctx);
This seems to allow the offset[] array to be indexed by an out of bounds
32-bit offset (k). What stops that, therefore what prevents reading a
target jump address from a random location in kernel space?

I think b_imm() should check that it's not going to overflow its array
of offsets.
Mircea Gherzan
2012-03-16 07:53:40 UTC
Permalink
Post by Russell King - ARM Linux
I'm unconvinced that this is safe.
Post by Mircea Gherzan
+static int build_body(struct jit_ctx *ctx)
+{
+ void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w};
+ const struct sk_filter *prog = ctx->skf;
+ const struct sock_filter *inst;
+ unsigned i, load_order, off, condt;
+ int imm12;
+ u32 k;
+
+ for (i = 0; i < prog->len; i++) {
+ inst = &(prog->insns[i]);
+ /* K as an immediate value operand */
+ k = inst->k;
+
+ /* compute offsets only in the fake pass */
+ if (ctx->target == NULL)
+ ctx->offsets[i] = ctx->idx * 4;
+
+ switch (inst->code) {
+ emit_mov_i(r_A, k, ctx);
+ break;
+ ctx->seen |= SEEN_SKB;
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+ emit(ARM_LDR_I(r_A, r_skb,
+ offsetof(struct sk_buff, len)), ctx);
+ break;
+ /* A = scratch[k] */
+ ctx->seen |= SEEN_MEM_WORD(k);
+ emit(ARM_LDR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+ break;
+ load_order = 2;
+ goto load;
+ load_order = 1;
+ goto load;
+ load_order = 0;
+ /* the interpreter will deal with the negative K */
+ if (k < 0)
+ return -1;
k is a u32, so won't this will always be false - negative numbers will
appear as huge positive numbers instead.
Good catch, thanks.
Post by Russell King - ARM Linux
Post by Mircea Gherzan
+ emit_mov_i(r_off, k, ctx);
+ ctx->seen |= SEEN_DATA | SEEN_CALL;
+
+ if (load_order > 0) {
+ emit(ARM_SUB_I(r_scratch, r_skb_hl,
+ 1 << load_order), ctx);
+ emit(ARM_CMP_R(r_scratch, r_off), ctx);
+ condt = ARM_COND_HS;
+ } else {
+ emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+ condt = ARM_COND_HI;
+ }
+
+ _emit(condt, ARM_ADD_R(r_scratch, r_off, r_skb_data),
+ ctx);
+
+ if (load_order == 0)
+ _emit(condt, ARM_LDRB_I(r_A, r_scratch, 0),
+ ctx);
+ else if (load_order == 1)
+ emit_load_be16(condt, r_A, r_scratch, ctx);
+ else if (load_order == 2)
+ emit_load_be32(condt, r_A, r_scratch, ctx);
sub r_scratch, r_skb_hl, #1 << load_order
cmp r_scratch, k
ldrhs dest, [r_skb_data, k]
Isn't this going to perform the loads for any k greater than the header
length? What stops k being larger than the packet? And is this correct
in the first place?
If K is an offset within the header (scratch "higher or same") then use
a direct load, otherwise land on the slowpath of skb_copy_bits(). If K
is larger than the packet then the skb_copy_bits() will return an error.

I don't see anything wrong with this.
Post by Russell King - ARM Linux
Post by Mircea Gherzan
+
+ _emit(condt, ARM_B(b_imm(i + 1, ctx)), ctx);
+
+ /* the slowpath */
+ emit_mov_i(ARM_R3, (u32)load_func[load_order], ctx);
+ emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+ /* the offset is already in R1 */
+ emit_blx_r(ARM_R3, ctx);
+ /* check the result of skb_copy_bits */
+ emit(ARM_CMP_I(ARM_R1, 0), ctx);
+ emit_err_ret(ARM_COND_NE, ctx);
+ emit(ARM_MOV_R(r_A, ARM_R0), ctx);
+ break;
+ load_order = 2;
+ goto load_ind;
+ load_order = 1;
+ goto load_ind;
+ load_order = 0;
+ OP_IMM3(ARM_ADD, r_off, r_X, k, ctx);
+ goto load_common;
+ ctx->seen |= SEEN_X;
+ emit_mov_i(r_X, k, ctx);
+ break;
+ ctx->seen |= SEEN_X | SEEN_SKB;
+ emit(ARM_LDR_I(r_X, r_skb,
+ offsetof(struct sk_buff, len)), ctx);
+ break;
+ ctx->seen |= SEEN_X | SEEN_MEM_WORD(k);
+ emit(ARM_LDR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
+#define SCRATCH_OFF(k) (SCRATCH_SP_OFFSET + (k))
So what if 'k' is a carefully chosen 32-bit number outside of the range
of the stack? Shouldn't there be a check for k < BPF_MEMWORDS to fail
the translation where-ever SEEN_MEM_WORD(k) or SCRATCH_OFF(k) is used?
The "scratch memory" accesses are checked in the the sk_chk_filter(), so
there's no need to do anything here.
Post by Russell King - ARM Linux
Post by Mircea Gherzan
+ break;
+ /* x = ((*(frame + k)) & 0xf) << 2; */
So, k is passed in from userspace. What stops k being out of the bounds
of 'frame' and referring to other kernel space data via carefully crafted
bpf code?
If k is out of bounds then we land once again on the skb_copy_bits()
slowpath.
Post by Russell King - ARM Linux
Post by Mircea Gherzan
+ ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL;
+ /* the interpreter should deal with the negative K */
+ if (k < 0)
+ return -1;
+ /* offset in r1: we might have to take the slow path */
+ emit_mov_i(r_off, k, ctx);
+ emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+
+ /* load in r0: common with the slowpath */
+ _emit(ARM_COND_HI, ARM_LDRB_R(ARM_R0, r_skb_data,
+ ARM_R1), ctx);
+ /*
+ * emit_mov_i() might generate one or two instructions,
+ * the same holds for emit_blx_r()
+ */
+ _emit(ARM_COND_HI, ARM_B(b_imm(i + 1, ctx) - 2), ctx);
+
+ emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+ /* r_off is r1 */
+ emit_mov_i(ARM_R3, (u32)jit_get_skb_b, ctx);
+ emit_blx_r(ARM_R3, ctx);
+
+ emit(ARM_AND_I(r_X, ARM_R0, 0x00f), ctx);
+ emit(ARM_LSL_I(r_X, r_X, 2), ctx);
+ break;
+ ctx->seen |= SEEN_MEM_WORD(k);
+ emit(ARM_STR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+ break;
+ update_on_xread(ctx);
+ ctx->seen |= SEEN_MEM_WORD(k);
+ emit(ARM_STR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
+ break;
+ /* A += K */
+ OP_IMM3(ARM_ADD, r_A, r_A, k, ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_ADD_R(r_A, r_A, r_X), ctx);
+ break;
+ /* A -= K */
+ OP_IMM3(ARM_SUB, r_A, r_A, k, ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_SUB_R(r_A, r_A, r_X), ctx);
+ break;
+ /* A *= K */
+ emit_mov_i(r_scratch, k, ctx);
+ emit(ARM_MUL(r_A, r_A, r_scratch), ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_MUL(r_A, r_A, r_X), ctx);
+ break;
+ /* current k == reciprocal_value(userspace k) */
+ emit_mov_i(r_scratch, k, ctx);
+ /* A = top 32 bits of the product */
+ emit(ARM_UMULL(r_scratch, r_A, r_A, r_scratch), ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_CMP_I(r_X, 0), ctx);
+ emit_err_ret(ARM_COND_EQ, ctx);
+ emit_udiv(r_A, r_A, r_X, ctx);
+ break;
+ /* A |= K */
+ OP_IMM3(ARM_ORR, r_A, r_A, k, ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_ORR_R(r_A, r_A, r_X), ctx);
+ break;
+ /* A &= K */
+ OP_IMM3(ARM_AND, r_A, r_A, k, ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_AND_R(r_A, r_A, r_X), ctx);
+ break;
+ if (unlikely(k > 31))
+ return -1;
+ emit(ARM_LSL_I(r_A, r_A, k), ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_LSL_R(r_A, r_A, r_X), ctx);
+ break;
+ if (unlikely(k > 31))
+ return -1;
+ emit(ARM_LSR_I(r_A, r_A, k), ctx);
+ break;
+ update_on_xread(ctx);
+ emit(ARM_LSR_R(r_A, r_A, r_X), ctx);
+ break;
+ /* A = -A */
+ emit(ARM_RSB_I(r_A, r_A, 0), ctx);
+ break;
+ /* pc += K */
+ emit(ARM_B(b_imm(i + k + 1, ctx)), ctx);
This seems to allow the offset[] array to be indexed by an out of bounds
32-bit offset (k). What stops that, therefore what prevents reading a
target jump address from a random location in kernel space?
The offset of the jump target is checked in sk_chk_filter(), so no need
to worry about anything here.
Post by Russell King - ARM Linux
I think b_imm() should check that it's not going to overflow its array
of offsets.
Thanks,
Mircea

Continue reading on narkive:
Loading...