Discussion:
[PATCH resend 01/18] crypto/algapi - use separate dst and src operands for __crypto_xor()
Ard Biesheuvel
2017-07-24 10:28:03 UTC
In preparation for introducing crypto_xor_cpy(), which will use separate
operands for input and output, modify the __crypto_xor() implementation,
which it will share with the existing crypto_xor(). __crypto_xor()
provides the actual functionality when the inline version is not used.
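
For reference, a minimal sketch (not part of the patch itself) of how the
shared helper is invoked after this change — crypto_xor() keeps its
in-place semantics by passing dst as both the output and the first input:

    void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len);

    /* in place, as done by crypto_xor(): dst ^= src */
    __crypto_xor(dst, dst, src, len);

    /* separate operands, as the upcoming crypto_xor_cpy() will do:
     * out = src1 ^ src2 */
    __crypto_xor(out, src1, src2, len);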

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
crypto/algapi.c | 25 ++++++++++++--------
include/crypto/algapi.h | 4 ++--
2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/crypto/algapi.c b/crypto/algapi.c
index e4cc7615a139..aa699ff6c876 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -975,13 +975,15 @@ void crypto_inc(u8 *a, unsigned int size)
}
EXPORT_SYMBOL_GPL(crypto_inc);

-void __crypto_xor(u8 *dst, const u8 *src, unsigned int len)
+void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len)
{
int relalign = 0;

if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
int size = sizeof(unsigned long);
- int d = ((unsigned long)dst ^ (unsigned long)src) & (size - 1);
+ int d = (((unsigned long)dst ^ (unsigned long)src1) |
+ ((unsigned long)dst ^ (unsigned long)src2)) &
+ (size - 1);

relalign = d ? 1 << __ffs(d) : size;

@@ -992,34 +994,37 @@ void __crypto_xor(u8 *dst, const u8 *src, unsigned int len)
* process the remainder of the input using optimal strides.
*/
while (((unsigned long)dst & (relalign - 1)) && len > 0) {
- *dst++ ^= *src++;
+ *dst++ = *src1++ ^ *src2++;
len--;
}
}

while (IS_ENABLED(CONFIG_64BIT) && len >= 8 && !(relalign & 7)) {
- *(u64 *)dst ^= *(u64 *)src;
+ *(u64 *)dst = *(u64 *)src1 ^ *(u64 *)src2;
dst += 8;
- src += 8;
+ src1 += 8;
+ src2 += 8;
len -= 8;
}

while (len >= 4 && !(relalign & 3)) {
- *(u32 *)dst ^= *(u32 *)src;
+ *(u32 *)dst = *(u32 *)src1 ^ *(u32 *)src2;
dst += 4;
- src += 4;
+ src1 += 4;
+ src2 += 4;
len -= 4;
}

while (len >= 2 && !(relalign & 1)) {
- *(u16 *)dst ^= *(u16 *)src;
+ *(u16 *)dst = *(u16 *)src1 ^ *(u16 *)src2;
dst += 2;
- src += 2;
+ src1 += 2;
+ src2 += 2;
len -= 2;
}

while (len--)
- *dst++ ^= *src++;
+ *dst++ = *src1++ ^ *src2++;
}
EXPORT_SYMBOL_GPL(__crypto_xor);

diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 436c4c2683c7..fd547f946bf8 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -192,7 +192,7 @@ static inline unsigned int crypto_queue_len(struct crypto_queue *queue)
}

void crypto_inc(u8 *a, unsigned int size);
-void __crypto_xor(u8 *dst, const u8 *src, unsigned int size);
+void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int size);

static inline void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
{
@@ -207,7 +207,7 @@ static inline void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
size -= sizeof(unsigned long);
}
} else {
- __crypto_xor(dst, src, size);
+ __crypto_xor(dst, dst, src, size);
}
}
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:06 UTC
The arm64 kernel will shortly disallow nested kernel mode NEON, so
add a fallback to scalar C code that can be invoked in that case.
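
The fallback follows the shape used throughout this series: check
may_use_simd() and only enter a kernel_neon_begin()/kernel_neon_end()
section when it returns true. A minimal sketch (function names as in the
patch below):

    if (may_use_simd()) {
            kernel_neon_begin();
            *crc = crc_t10dif_pmull(*crc, data, length);
            kernel_neon_end();
    } else {
            /* scalar fallback, usable when the NEON must not be touched */
            *crc = crc_t10dif_generic(*crc, data, length);
    }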

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/crct10dif-ce-glue.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index 60cb590c2590..96f0cae4a022 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -1,7 +1,7 @@
/*
* Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
*
- * Copyright (C) 2016 Linaro Ltd <***@linaro.org>
+ * Copyright (C) 2016 - 2017 Linaro Ltd <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -18,6 +18,7 @@
#include <crypto/internal/hash.h>

#include <asm/neon.h>
+#include <asm/simd.h>

#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U

@@ -48,9 +49,13 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
}

if (length > 0) {
- kernel_neon_begin_partial(14);
- *crc = crc_t10dif_pmull(*crc, data, length);
- kernel_neon_end();
+ if (may_use_simd()) {
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull(*crc, data, length);
+ kernel_neon_end();
+ } else {
+ *crc = crc_t10dif_generic(*crc, data, length);
+ }
}

return 0;
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:04 UTC
There are quite a number of occurrences in the kernel of the pattern

    if (dst != src)
            memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
    crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE);

or

    crypto_xor(keystream, src, nbytes);
    memcpy(dst, keystream, nbytes);

where crypto_xor() is preceded or followed by a memcpy() invocation
that is only there because crypto_xor() uses its output parameter as
one of the inputs. To avoid having to add new instances of this pattern
in the arm64 code, which will be refactored to implement non-SIMD
fallbacks, add an alternative implementation called crypto_xor_cpy(),
taking separate input and output arguments. This removes the need for
the separate memcpy().
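
As an illustration (a sketch, not one of the hunks below), the first
pattern collapses to a single call with distinct input and output
operands:

    crypto_xor_cpy(dst, src, final, walk.total % AES_BLOCK_SIZE);

and the keystream variant likewise becomes

    crypto_xor_cpy(dst, keystream, src, nbytes);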

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm/crypto/aes-ce-glue.c | 4 +---
arch/arm/crypto/aes-neonbs-glue.c | 5 ++---
arch/arm64/crypto/aes-glue.c | 4 +---
arch/arm64/crypto/aes-neonbs-glue.c | 5 ++---
arch/sparc/crypto/aes_glue.c | 3 +--
arch/x86/crypto/aesni-intel_glue.c | 4 ++--
arch/x86/crypto/blowfish_glue.c | 3 +--
arch/x86/crypto/cast5_avx_glue.c | 3 +--
arch/x86/crypto/des3_ede_glue.c | 3 +--
crypto/ctr.c | 3 +--
crypto/pcbc.c | 12 ++++--------
drivers/crypto/vmx/aes_ctr.c | 3 +--
drivers/md/dm-crypt.c | 11 +++++------
include/crypto/algapi.h | 19 +++++++++++++++++++
14 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c
index 0f966a8ca1ce..d0a9cec73707 100644
--- a/arch/arm/crypto/aes-ce-glue.c
+++ b/arch/arm/crypto/aes-ce-glue.c
@@ -285,9 +285,7 @@ static int ctr_encrypt(struct skcipher_request *req)

ce_aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc,
num_rounds(ctx), blocks, walk.iv);
- if (tdst != tsrc)
- memcpy(tdst, tsrc, nbytes);
- crypto_xor(tdst, tail, nbytes);
+ crypto_xor_cpy(tdst, tsrc, tail, nbytes);
err = skcipher_walk_done(&walk, 0);
}
kernel_neon_end();
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index c76377961444..18768f330449 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -221,9 +221,8 @@ static int ctr_encrypt(struct skcipher_request *req)
u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;

- if (dst != src)
- memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
- crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE);
+ crypto_xor_cpy(dst, src, final,
+ walk.total % AES_BLOCK_SIZE);

err = skcipher_walk_done(&walk, 0);
break;
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index bcf596b0197e..0da30e3b0e4b 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -241,9 +241,7 @@ static int ctr_encrypt(struct skcipher_request *req)

aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
blocks, walk.iv, first);
- if (tdst != tsrc)
- memcpy(tdst, tsrc, nbytes);
- crypto_xor(tdst, tail, nbytes);
+ crypto_xor_cpy(tdst, tsrc, tail, nbytes);
err = skcipher_walk_done(&walk, 0);
}
kernel_neon_end();
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
index db2501d93550..9001aec16007 100644
--- a/arch/arm64/crypto/aes-neonbs-glue.c
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -224,9 +224,8 @@ static int ctr_encrypt(struct skcipher_request *req)
u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;

- if (dst != src)
- memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
- crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE);
+ crypto_xor_cpy(dst, src, final,
+ walk.total % AES_BLOCK_SIZE);

err = skcipher_walk_done(&walk, 0);
break;
diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c
index c90930de76ba..3cd4f6b198b6 100644
--- a/arch/sparc/crypto/aes_glue.c
+++ b/arch/sparc/crypto/aes_glue.c
@@ -344,8 +344,7 @@ static void ctr_crypt_final(struct crypto_sparc64_aes_ctx *ctx,

ctx->ops->ecb_encrypt(&ctx->key[0], (const u64 *)ctrblk,
keystream, AES_BLOCK_SIZE);
- crypto_xor((u8 *) keystream, src, nbytes);
- memcpy(dst, keystream, nbytes);
+ crypto_xor_cpy(dst, (u8 *) keystream, src, nbytes);
crypto_inc(ctrblk, AES_BLOCK_SIZE);
}

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 4a55cdcdc008..5c15d6b57329 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -475,8 +475,8 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
unsigned int nbytes = walk->nbytes;

aesni_enc(ctx, keystream, ctrblk);
- crypto_xor(keystream, src, nbytes);
- memcpy(dst, keystream, nbytes);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
+
crypto_inc(ctrblk, AES_BLOCK_SIZE);
}

diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 17c05531dfd1..f9eca34301e2 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -271,8 +271,7 @@ static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk)
unsigned int nbytes = walk->nbytes;

blowfish_enc_blk(ctx, keystream, ctrblk);
- crypto_xor(keystream, src, nbytes);
- memcpy(dst, keystream, nbytes);
+ crypto_xor_cpy(dst, keystream, src, nbytes);

crypto_inc(ctrblk, BF_BLOCK_SIZE);
}
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 8648158f3916..dbea6020ffe7 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -256,8 +256,7 @@ static void ctr_crypt_final(struct blkcipher_desc *desc,
unsigned int nbytes = walk->nbytes;

__cast5_encrypt(ctx, keystream, ctrblk);
- crypto_xor(keystream, src, nbytes);
- memcpy(dst, keystream, nbytes);
+ crypto_xor_cpy(dst, keystream, src, nbytes);

crypto_inc(ctrblk, CAST5_BLOCK_SIZE);
}
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index d6fc59aaaadf..30c0a37f4882 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -277,8 +277,7 @@ static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx,
unsigned int nbytes = walk->nbytes;

des3_ede_enc_blk(ctx, keystream, ctrblk);
- crypto_xor(keystream, src, nbytes);
- memcpy(dst, keystream, nbytes);
+ crypto_xor_cpy(dst, keystream, src, nbytes);

crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE);
}
diff --git a/crypto/ctr.c b/crypto/ctr.c
index 477d9226ccaa..854d924f9d8e 100644
--- a/crypto/ctr.c
+++ b/crypto/ctr.c
@@ -65,8 +65,7 @@ static void crypto_ctr_crypt_final(struct blkcipher_walk *walk,
unsigned int nbytes = walk->nbytes;

crypto_cipher_encrypt_one(tfm, keystream, ctrblk);
- crypto_xor(keystream, src, nbytes);
- memcpy(dst, keystream, nbytes);
+ crypto_xor_cpy(dst, keystream, src, nbytes);

crypto_inc(ctrblk, bsize);
}
diff --git a/crypto/pcbc.c b/crypto/pcbc.c
index 29dd2b4a3b85..d9e45a958720 100644
--- a/crypto/pcbc.c
+++ b/crypto/pcbc.c
@@ -55,8 +55,7 @@ static int crypto_pcbc_encrypt_segment(struct skcipher_request *req,
do {
crypto_xor(iv, src, bsize);
crypto_cipher_encrypt_one(tfm, dst, iv);
- memcpy(iv, dst, bsize);
- crypto_xor(iv, src, bsize);
+ crypto_xor_cpy(iv, dst, src, bsize);

src += bsize;
dst += bsize;
@@ -79,8 +78,7 @@ static int crypto_pcbc_encrypt_inplace(struct skcipher_request *req,
memcpy(tmpbuf, src, bsize);
crypto_xor(iv, src, bsize);
crypto_cipher_encrypt_one(tfm, src, iv);
- memcpy(iv, tmpbuf, bsize);
- crypto_xor(iv, src, bsize);
+ crypto_xor_cpy(iv, tmpbuf, src, bsize);

src += bsize;
} while ((nbytes -= bsize) >= bsize);
@@ -127,8 +125,7 @@ static int crypto_pcbc_decrypt_segment(struct skcipher_request *req,
do {
crypto_cipher_decrypt_one(tfm, dst, src);
crypto_xor(dst, iv, bsize);
- memcpy(iv, src, bsize);
- crypto_xor(iv, dst, bsize);
+ crypto_xor_cpy(iv, dst, src, bsize);

src += bsize;
dst += bsize;
@@ -153,8 +150,7 @@ static int crypto_pcbc_decrypt_inplace(struct skcipher_request *req,
memcpy(tmpbuf, src, bsize);
crypto_cipher_decrypt_one(tfm, src, src);
crypto_xor(src, iv, bsize);
- memcpy(iv, tmpbuf, bsize);
- crypto_xor(iv, src, bsize);
+ crypto_xor_cpy(iv, src, tmpbuf, bsize);

src += bsize;
} while ((nbytes -= bsize) >= bsize);
diff --git a/drivers/crypto/vmx/aes_ctr.c b/drivers/crypto/vmx/aes_ctr.c
index 9c26d9e8dbea..15a23f7e2e24 100644
--- a/drivers/crypto/vmx/aes_ctr.c
+++ b/drivers/crypto/vmx/aes_ctr.c
@@ -104,8 +104,7 @@ static void p8_aes_ctr_final(struct p8_aes_ctr_ctx *ctx,
pagefault_enable();
preempt_enable();

- crypto_xor(keystream, src, nbytes);
- memcpy(dst, keystream, nbytes);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
crypto_inc(ctrblk, AES_BLOCK_SIZE);
}

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index cdf6b1e12460..fa17e5452796 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -758,9 +758,8 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc,
int i, r;

/* xor whitening with sector number */
- memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
- crypto_xor(buf, (u8 *)&sector, 8);
- crypto_xor(&buf[8], (u8 *)&sector, 8);
+ crypto_xor_cpy(buf, tcw->whitening, (u8 *)&sector, 8);
+ crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)&sector, 8);

/* calculate crc32 for every 32bit part and xor it */
desc->tfm = tcw->crc32_tfm;
@@ -805,10 +804,10 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
}

/* Calculate IV */
- memcpy(iv, tcw->iv_seed, cc->iv_size);
- crypto_xor(iv, (u8 *)&sector, 8);
+ crypto_xor_cpy(iv, tcw->iv_seed, (u8 *)&sector, 8);
if (cc->iv_size > 8)
- crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8);
+ crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)&sector,
+ cc->iv_size - 8);

return r;
}
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index fd547f946bf8..e3cebf640c00 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -211,6 +211,25 @@ static inline void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
}
}

+static inline void crypto_xor_cpy(u8 *dst, const u8 *src1, const u8 *src2,
+ unsigned int size)
+{
+ if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ __builtin_constant_p(size) &&
+ (size % sizeof(unsigned long)) == 0) {
+ unsigned long *d = (unsigned long *)dst;
+ unsigned long *s1 = (unsigned long *)src1;
+ unsigned long *s2 = (unsigned long *)src2;
+
+ while (size > 0) {
+ *d++ = *s1++ ^ *s2++;
+ size -= sizeof(unsigned long);
+ }
+ } else {
+ __crypto_xor(dst, src1, src2, size);
+ }
+}
+
int blkcipher_walk_done(struct blkcipher_desc *desc,
struct blkcipher_walk *walk, int err);
int blkcipher_walk_virt(struct blkcipher_desc *desc,
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:05 UTC
The arm64 kernel will shortly disallow nested kernel mode NEON, so
add a fallback to scalar C code that can be invoked in that case.
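
When the NEON is unavailable, the scalar path computes GHASH directly with
the generic GF(2^128) helper: per 16-byte block, XOR the block into the
digest and multiply by the hash key H. A sketch of the per-block step
(gf128mul_lle() comes from CRYPTO_GF128MUL, which the patch below selects):

    be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };

    crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);  /* dst ^= block */
    gf128mul_lle(&dst, &key->k);                   /* dst *= H in GF(2^128) */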

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/Kconfig | 3 +-
arch/arm64/crypto/ghash-ce-glue.c | 49 ++++++++++++++++----
2 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index d92293747d63..7d75a363e317 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -28,8 +28,9 @@ config CRYPTO_SHA2_ARM64_CE

config CRYPTO_GHASH_ARM64_CE
tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
- depends on ARM64 && KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON
select CRYPTO_HASH
+ select CRYPTO_GF128MUL

config CRYPTO_CRCT10DIF_ARM64_CE
tristate "CRCT10DIF digest algorithm using PMULL instructions"
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 833ec1e3f3e9..30221ef56e70 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -1,7 +1,7 @@
/*
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
*
- * Copyright (C) 2014 Linaro Ltd. <***@linaro.org>
+ * Copyright (C) 2014 - 2017 Linaro Ltd. <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@@ -9,7 +9,9 @@
*/

#include <asm/neon.h>
+#include <asm/simd.h>
#include <asm/unaligned.h>
+#include <crypto/gf128mul.h>
#include <crypto/internal/hash.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
@@ -25,6 +27,7 @@ MODULE_LICENSE("GPL v2");
struct ghash_key {
u64 a;
u64 b;
+ be128 k;
};

struct ghash_desc_ctx {
@@ -44,6 +47,36 @@ static int ghash_init(struct shash_desc *desc)
return 0;
}

+static void ghash_do_update(int blocks, u64 dg[], const char *src,
+ struct ghash_key *key, const char *head)
+{
+ if (likely(may_use_simd())) {
+ kernel_neon_begin();
+ pmull_ghash_update(blocks, dg, src, key, head);
+ kernel_neon_end();
+ } else {
+ be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
+
+ do {
+ const u8 *in = src;
+
+ if (head) {
+ in = head;
+ blocks++;
+ head = NULL;
+ } else {
+ src += GHASH_BLOCK_SIZE;
+ }
+
+ crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
+ gf128mul_lle(&dst, &key->k);
+ } while (--blocks);
+
+ dg[0] = be64_to_cpu(dst.b);
+ dg[1] = be64_to_cpu(dst.a);
+ }
+}
+
static int ghash_update(struct shash_desc *desc, const u8 *src,
unsigned int len)
{
@@ -67,10 +100,9 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
blocks = len / GHASH_BLOCK_SIZE;
len %= GHASH_BLOCK_SIZE;

- kernel_neon_begin_partial(8);
- pmull_ghash_update(blocks, ctx->digest, src, key,
- partial ? ctx->buf : NULL);
- kernel_neon_end();
+ ghash_do_update(blocks, ctx->digest, src, key,
+ partial ? ctx->buf : NULL);
+
src += blocks * GHASH_BLOCK_SIZE;
partial = 0;
}
@@ -89,9 +121,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)

memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);

- kernel_neon_begin_partial(8);
- pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
- kernel_neon_end();
+ ghash_do_update(1, ctx->digest, ctx->buf, key, NULL);
}
put_unaligned_be64(ctx->digest[1], dst);
put_unaligned_be64(ctx->digest[0], dst + 8);
@@ -111,6 +141,9 @@ static int ghash_setkey(struct crypto_shash *tfm,
return -EINVAL;
}

+ /* needed for the fallback */
+ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
+
/* perform multiplication by 'x' in GF(2^128) */
b = get_unaligned_be64(inkey);
a = get_unaligned_be64(inkey + 8);
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:07 UTC
The arm64 kernel will shortly disallow nested kernel mode NEON, so
add a fallback to scalar C code that can be invoked in that case.

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/crc32-ce-glue.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
index eccb1ae90064..624f4137918c 100644
--- a/arch/arm64/crypto/crc32-ce-glue.c
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -1,7 +1,7 @@
/*
* Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
*
- * Copyright (C) 2016 Linaro Ltd <***@linaro.org>
+ * Copyright (C) 2016 - 2017 Linaro Ltd <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -19,6 +19,7 @@

#include <asm/hwcap.h>
#include <asm/neon.h>
+#include <asm/simd.h>
#include <asm/unaligned.h>

#define PMULL_MIN_LEN 64L /* minimum size of buffer
@@ -105,10 +106,10 @@ static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
length -= l;
}

- if (length >= PMULL_MIN_LEN) {
+ if (length >= PMULL_MIN_LEN && may_use_simd()) {
l = round_down(length, SCALE_F);

- kernel_neon_begin_partial(10);
+ kernel_neon_begin();
*crc = crc32_pmull_le(data, l, *crc);
kernel_neon_end();

@@ -137,10 +138,10 @@ static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
length -= l;
}

- if (length >= PMULL_MIN_LEN) {
+ if (length >= PMULL_MIN_LEN && may_use_simd()) {
l = round_down(length, SCALE_F);

- kernel_neon_begin_partial(10);
+ kernel_neon_begin();
*crc = crc32c_pmull_le(data, l, *crc);
kernel_neon_end();
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:09 UTC
The arm64 kernel will shortly disallow nested kernel mode NEON, so
add a fallback to scalar code that can be invoked in that case.

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/Kconfig | 3 +-
arch/arm64/crypto/sha2-ce-glue.c | 30 +++++++++++++++++---
arch/arm64/crypto/sha256-glue.c | 1 +
3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 5d5953545dad..8cd145f9c1ff 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -24,8 +24,9 @@ config CRYPTO_SHA1_ARM64_CE

config CRYPTO_SHA2_ARM64_CE
tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
- depends on ARM64 && KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON
select CRYPTO_HASH
+ select CRYPTO_SHA256_ARM64

config CRYPTO_GHASH_ARM64_CE
tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index 0ed9486f75dd..fd1ff2b13dfa 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -1,7 +1,7 @@
/*
* sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
*
- * Copyright (C) 2014 Linaro Ltd <***@linaro.org>
+ * Copyright (C) 2014 - 2017 Linaro Ltd <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -9,6 +9,7 @@
*/

#include <asm/neon.h>
+#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/sha.h>
@@ -34,13 +35,19 @@ const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state,
finalize);

+asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks);
+
static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct sha256_ce_state *sctx = shash_desc_ctx(desc);

+ if (!may_use_simd())
+ return sha256_base_do_update(desc, data, len,
+ (sha256_block_fn *)sha256_block_data_order);
+
sctx->finalize = 0;
- kernel_neon_begin_partial(28);
+ kernel_neon_begin();
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha2_ce_transform);
kernel_neon_end();
@@ -54,13 +61,22 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
struct sha256_ce_state *sctx = shash_desc_ctx(desc);
bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE);

+ if (!may_use_simd()) {
+ if (len)
+ sha256_base_do_update(desc, data, len,
+ (sha256_block_fn *)sha256_block_data_order);
+ sha256_base_do_finalize(desc,
+ (sha256_block_fn *)sha256_block_data_order);
+ return sha256_base_finish(desc, out);
+ }
+
/*
* Allow the asm code to perform the finalization if there is no
* partial data and the input is a round multiple of the block size.
*/
sctx->finalize = finalize;

- kernel_neon_begin_partial(28);
+ kernel_neon_begin();
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha2_ce_transform);
if (!finalize)
@@ -74,8 +90,14 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out)
{
struct sha256_ce_state *sctx = shash_desc_ctx(desc);

+ if (!may_use_simd()) {
+ sha256_base_do_finalize(desc,
+ (sha256_block_fn *)sha256_block_data_order);
+ return sha256_base_finish(desc, out);
+ }
+
sctx->finalize = 0;
- kernel_neon_begin_partial(28);
+ kernel_neon_begin();
sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform);
kernel_neon_end();
return sha256_base_finish(desc, out);
diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c
index a2226f841960..b064d925fe2a 100644
--- a/arch/arm64/crypto/sha256-glue.c
+++ b/arch/arm64/crypto/sha256-glue.c
@@ -29,6 +29,7 @@ MODULE_ALIAS_CRYPTO("sha256");

asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
unsigned int num_blks);
+EXPORT_SYMBOL(sha256_block_data_order);

asmlinkage void sha256_block_neon(u32 *digest, const void *data,
unsigned int num_blks);
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:11 UTC
The arm64 kernel will shortly disallow nested kernel mode NEON, so
add a fallback to scalar code that can be invoked in that case.

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/Kconfig | 1 +
arch/arm64/crypto/aes-ce-cipher.c | 20 +++++++++++++++++---
2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 8cd145f9c1ff..2fd4bb6d0b5a 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -52,6 +52,7 @@ config CRYPTO_AES_ARM64_CE
tristate "AES core cipher using ARMv8 Crypto Extensions"
depends on ARM64 && KERNEL_MODE_NEON
select CRYPTO_ALGAPI
+ select CRYPTO_AES_ARM64

config CRYPTO_AES_ARM64_CE_CCM
tristate "AES in CCM mode using ARMv8 Crypto Extensions"
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c
index a0a0e5e3a8b5..6a75cd75ed11 100644
--- a/arch/arm64/crypto/aes-ce-cipher.c
+++ b/arch/arm64/crypto/aes-ce-cipher.c
@@ -9,6 +9,7 @@
*/

#include <asm/neon.h>
+#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/aes.h>
#include <linux/cpufeature.h>
@@ -21,6 +22,9 @@ MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <***@linaro.org>");
MODULE_LICENSE("GPL v2");

+asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+
struct aes_block {
u8 b[AES_BLOCK_SIZE];
};
@@ -45,7 +49,12 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
void *dummy0;
int dummy1;

- kernel_neon_begin_partial(4);
+ if (!may_use_simd()) {
+ __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
+ return;
+ }
+
+ kernel_neon_begin();

__asm__(" ld1 {v0.16b}, %[in] ;"
" ld1 {v1.4s}, [%[key]], #16 ;"
@@ -90,7 +99,12 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
void *dummy0;
int dummy1;

- kernel_neon_begin_partial(4);
+ if (!may_use_simd()) {
+ __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
+ return;
+ }
+
+ kernel_neon_begin();

__asm__(" ld1 {v0.16b}, %[in] ;"
" ld1 {v1.4s}, [%[key]], #16 ;"
@@ -170,7 +184,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
for (i = 0; i < kwords; i++)
ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32));

- kernel_neon_begin_partial(2);
+ kernel_neon_begin();
for (i = 0; i < sizeof(rcon); i++) {
u32 *rki = ctx->key_enc + (i * kwords);
u32 *rko = rki + kwords;
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:08 UTC
Permalink
The arm64 kernel will shortly disallow nested kernel mode NEON, so
add a fallback to scalar C code that can be invoked in that case.

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/Kconfig | 3 ++-
arch/arm64/crypto/sha1-ce-glue.c | 18 ++++++++++++++----
2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 7d75a363e317..5d5953545dad 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -18,8 +18,9 @@ config CRYPTO_SHA512_ARM64

config CRYPTO_SHA1_ARM64_CE
tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
- depends on ARM64 && KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON
select CRYPTO_HASH
+ select CRYPTO_SHA1

config CRYPTO_SHA2_ARM64_CE
tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
index ea319c055f5d..efbeb3e0dcfb 100644
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -1,7 +1,7 @@
/*
* sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
*
- * Copyright (C) 2014 Linaro Ltd <***@linaro.org>
+ * Copyright (C) 2014 - 2017 Linaro Ltd <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -9,6 +9,7 @@
*/

#include <asm/neon.h>
+#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/sha.h>
@@ -37,8 +38,11 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
{
struct sha1_ce_state *sctx = shash_desc_ctx(desc);

+ if (!may_use_simd())
+ return crypto_sha1_update(desc, data, len);
+
sctx->finalize = 0;
- kernel_neon_begin_partial(16);
+ kernel_neon_begin();
sha1_base_do_update(desc, data, len,
(sha1_block_fn *)sha1_ce_transform);
kernel_neon_end();
@@ -52,13 +56,16 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
struct sha1_ce_state *sctx = shash_desc_ctx(desc);
bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE);

+ if (!may_use_simd())
+ return crypto_sha1_finup(desc, data, len, out);
+
/*
* Allow the asm code to perform the finalization if there is no
* partial data and the input is a round multiple of the block size.
*/
sctx->finalize = finalize;

- kernel_neon_begin_partial(16);
+ kernel_neon_begin();
sha1_base_do_update(desc, data, len,
(sha1_block_fn *)sha1_ce_transform);
if (!finalize)
@@ -71,8 +78,11 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out)
{
struct sha1_ce_state *sctx = shash_desc_ctx(desc);

+ if (!may_use_simd())
+ return crypto_sha1_finup(desc, NULL, 0, out);
+
sctx->finalize = 0;
- kernel_neon_begin_partial(16);
+ kernel_neon_begin();
sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform);
kernel_neon_end();
return sha1_base_finish(desc, out);
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:10 UTC
In order to be able to reuse the generic AES code as a fallback for
situations where the NEON may not be used, update the key handling
to match the byte order of the generic code: it stores round keys
as sequences of 32-bit quantities rather than as streams of bytes,
so our code needs to be updated to reflect that.
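
A sketch of the difference in the setkey path (get_unaligned_le32() is
what the patch below switches to):

    /* old: round keys copied as a raw byte stream */
    memcpy(ctx->key_enc, in_key, key_len);

    /* new: round keys stored as native u32 words, loaded little-endian,
     * matching the layout the generic AES code expects */
    for (i = 0; i < key_len / sizeof(u32); i++)
            ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32));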

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/aes-ce-ccm-core.S | 30 ++++++++---------
arch/arm64/crypto/aes-ce-cipher.c | 35 +++++++++-----------
arch/arm64/crypto/aes-ce.S | 12 +++----
3 files changed, 37 insertions(+), 40 deletions(-)

diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index 3363560c79b7..e3a375c4cb83 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -1,7 +1,7 @@
/*
* aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
*
- * Copyright (C) 2013 - 2014 Linaro Ltd <***@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -32,7 +32,7 @@ ENTRY(ce_aes_ccm_auth_data)
beq 8f /* out of input? */
cbnz w8, 0b
eor v0.16b, v0.16b, v1.16b
-1: ld1 {v3.16b}, [x4] /* load first round key */
+1: ld1 {v3.4s}, [x4] /* load first round key */
prfm pldl1strm, [x1]
cmp w5, #12 /* which key size? */
add x6, x4, #16
@@ -42,17 +42,17 @@ ENTRY(ce_aes_ccm_auth_data)
mov v5.16b, v3.16b
b 4f
2: mov v4.16b, v3.16b
- ld1 {v5.16b}, [x6], #16 /* load 2nd round key */
+ ld1 {v5.4s}, [x6], #16 /* load 2nd round key */
3: aese v0.16b, v4.16b
aesmc v0.16b, v0.16b
-4: ld1 {v3.16b}, [x6], #16 /* load next round key */
+4: ld1 {v3.4s}, [x6], #16 /* load next round key */
aese v0.16b, v5.16b
aesmc v0.16b, v0.16b
-5: ld1 {v4.16b}, [x6], #16 /* load next round key */
+5: ld1 {v4.4s}, [x6], #16 /* load next round key */
subs w7, w7, #3
aese v0.16b, v3.16b
aesmc v0.16b, v0.16b
- ld1 {v5.16b}, [x6], #16 /* load next round key */
+ ld1 {v5.4s}, [x6], #16 /* load next round key */
bpl 3b
aese v0.16b, v4.16b
subs w2, w2, #16 /* last data? */
@@ -90,7 +90,7 @@ ENDPROC(ce_aes_ccm_auth_data)
* u32 rounds);
*/
ENTRY(ce_aes_ccm_final)
- ld1 {v3.16b}, [x2], #16 /* load first round key */
+ ld1 {v3.4s}, [x2], #16 /* load first round key */
ld1 {v0.16b}, [x0] /* load mac */
cmp w3, #12 /* which key size? */
sub w3, w3, #2 /* modified # of rounds */
@@ -100,17 +100,17 @@ ENTRY(ce_aes_ccm_final)
mov v5.16b, v3.16b
b 2f
0: mov v4.16b, v3.16b
-1: ld1 {v5.16b}, [x2], #16 /* load next round key */
+1: ld1 {v5.4s}, [x2], #16 /* load next round key */
aese v0.16b, v4.16b
aesmc v0.16b, v0.16b
aese v1.16b, v4.16b
aesmc v1.16b, v1.16b
-2: ld1 {v3.16b}, [x2], #16 /* load next round key */
+2: ld1 {v3.4s}, [x2], #16 /* load next round key */
aese v0.16b, v5.16b
aesmc v0.16b, v0.16b
aese v1.16b, v5.16b
aesmc v1.16b, v1.16b
-3: ld1 {v4.16b}, [x2], #16 /* load next round key */
+3: ld1 {v4.4s}, [x2], #16 /* load next round key */
subs w3, w3, #3
aese v0.16b, v3.16b
aesmc v0.16b, v0.16b
@@ -137,31 +137,31 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
cmp w4, #12 /* which key size? */
sub w7, w4, #2 /* get modified # of rounds */
ins v1.d[1], x9 /* no carry in lower ctr */
- ld1 {v3.16b}, [x3] /* load first round key */
+ ld1 {v3.4s}, [x3] /* load first round key */
add x10, x3, #16
bmi 1f
bne 4f
mov v5.16b, v3.16b
b 3f
1: mov v4.16b, v3.16b
- ld1 {v5.16b}, [x10], #16 /* load 2nd round key */
+ ld1 {v5.4s}, [x10], #16 /* load 2nd round key */
2: /* inner loop: 3 rounds, 2x interleaved */
aese v0.16b, v4.16b
aesmc v0.16b, v0.16b
aese v1.16b, v4.16b
aesmc v1.16b, v1.16b
-3: ld1 {v3.16b}, [x10], #16 /* load next round key */
+3: ld1 {v3.4s}, [x10], #16 /* load next round key */
aese v0.16b, v5.16b
aesmc v0.16b, v0.16b
aese v1.16b, v5.16b
aesmc v1.16b, v1.16b
-4: ld1 {v4.16b}, [x10], #16 /* load next round key */
+4: ld1 {v4.4s}, [x10], #16 /* load next round key */
subs w7, w7, #3
aese v0.16b, v3.16b
aesmc v0.16b, v0.16b
aese v1.16b, v3.16b
aesmc v1.16b, v1.16b
- ld1 {v5.16b}, [x10], #16 /* load next round key */
+ ld1 {v5.4s}, [x10], #16 /* load next round key */
bpl 2b
aese v0.16b, v4.16b
aese v1.16b, v4.16b
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c
index 50d9fe11d0c8..a0a0e5e3a8b5 100644
--- a/arch/arm64/crypto/aes-ce-cipher.c
+++ b/arch/arm64/crypto/aes-ce-cipher.c
@@ -1,7 +1,7 @@
/*
* aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions
*
- * Copyright (C) 2013 - 2014 Linaro Ltd <***@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -9,6 +9,7 @@
*/

#include <asm/neon.h>
+#include <asm/unaligned.h>
#include <crypto/aes.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
@@ -47,24 +48,24 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
kernel_neon_begin_partial(4);

__asm__(" ld1 {v0.16b}, %[in] ;"
- " ld1 {v1.16b}, [%[key]], #16 ;"
+ " ld1 {v1.4s}, [%[key]], #16 ;"
" cmp %w[rounds], #10 ;"
" bmi 0f ;"
" bne 3f ;"
" mov v3.16b, v1.16b ;"
" b 2f ;"
"0: mov v2.16b, v1.16b ;"
- " ld1 {v3.16b}, [%[key]], #16 ;"
+ " ld1 {v3.4s}, [%[key]], #16 ;"
"1: aese v0.16b, v2.16b ;"
" aesmc v0.16b, v0.16b ;"
- "2: ld1 {v1.16b}, [%[key]], #16 ;"
+ "2: ld1 {v1.4s}, [%[key]], #16 ;"
" aese v0.16b, v3.16b ;"
" aesmc v0.16b, v0.16b ;"
- "3: ld1 {v2.16b}, [%[key]], #16 ;"
+ "3: ld1 {v2.4s}, [%[key]], #16 ;"
" subs %w[rounds], %w[rounds], #3 ;"
" aese v0.16b, v1.16b ;"
" aesmc v0.16b, v0.16b ;"
- " ld1 {v3.16b}, [%[key]], #16 ;"
+ " ld1 {v3.4s}, [%[key]], #16 ;"
" bpl 1b ;"
" aese v0.16b, v2.16b ;"
" eor v0.16b, v0.16b, v3.16b ;"
@@ -92,24 +93,24 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
kernel_neon_begin_partial(4);

__asm__(" ld1 {v0.16b}, %[in] ;"
- " ld1 {v1.16b}, [%[key]], #16 ;"
+ " ld1 {v1.4s}, [%[key]], #16 ;"
" cmp %w[rounds], #10 ;"
" bmi 0f ;"
" bne 3f ;"
" mov v3.16b, v1.16b ;"
" b 2f ;"
"0: mov v2.16b, v1.16b ;"
- " ld1 {v3.16b}, [%[key]], #16 ;"
+ " ld1 {v3.4s}, [%[key]], #16 ;"
"1: aesd v0.16b, v2.16b ;"
" aesimc v0.16b, v0.16b ;"
- "2: ld1 {v1.16b}, [%[key]], #16 ;"
+ "2: ld1 {v1.4s}, [%[key]], #16 ;"
" aesd v0.16b, v3.16b ;"
" aesimc v0.16b, v0.16b ;"
- "3: ld1 {v2.16b}, [%[key]], #16 ;"
+ "3: ld1 {v2.4s}, [%[key]], #16 ;"
" subs %w[rounds], %w[rounds], #3 ;"
" aesd v0.16b, v1.16b ;"
" aesimc v0.16b, v0.16b ;"
- " ld1 {v3.16b}, [%[key]], #16 ;"
+ " ld1 {v3.4s}, [%[key]], #16 ;"
" bpl 1b ;"
" aesd v0.16b, v2.16b ;"
" eor v0.16b, v0.16b, v3.16b ;"
@@ -165,20 +166,16 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
key_len != AES_KEYSIZE_256)
return -EINVAL;

- memcpy(ctx->key_enc, in_key, key_len);
ctx->key_length = key_len;
+ for (i = 0; i < kwords; i++)
+ ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32));

kernel_neon_begin_partial(2);
for (i = 0; i < sizeof(rcon); i++) {
u32 *rki = ctx->key_enc + (i * kwords);
u32 *rko = rki + kwords;

-#ifndef CONFIG_CPU_BIG_ENDIAN
rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
-#else
- rko[0] = rol32(aes_sub(rki[kwords - 1]), 8) ^ (rcon[i] << 24) ^
- rki[0];
-#endif
rko[1] = rko[0] ^ rki[1];
rko[2] = rko[1] ^ rki[2];
rko[3] = rko[2] ^ rki[3];
@@ -210,9 +207,9 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,

key_dec[0] = key_enc[j];
for (i = 1, j--; j > 0; i++, j--)
- __asm__("ld1 {v0.16b}, %[in] ;"
+ __asm__("ld1 {v0.4s}, %[in] ;"
"aesimc v1.16b, v0.16b ;"
- "st1 {v1.16b}, %[out] ;"
+ "st1 {v1.4s}, %[out] ;"

: [out] "=Q"(key_dec[i])
: [in] "Q"(key_enc[j])
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index b46093d567e5..50330f5c3adc 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -2,7 +2,7 @@
* linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with
* Crypto Extensions
*
- * Copyright (C) 2013 Linaro Ltd <***@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -22,11 +22,11 @@
cmp \rounds, #12
blo 2222f /* 128 bits */
beq 1111f /* 192 bits */
- ld1 {v17.16b-v18.16b}, [\rk], #32
-1111: ld1 {v19.16b-v20.16b}, [\rk], #32
-2222: ld1 {v21.16b-v24.16b}, [\rk], #64
- ld1 {v25.16b-v28.16b}, [\rk], #64
- ld1 {v29.16b-v31.16b}, [\rk]
+ ld1 {v17.4s-v18.4s}, [\rk], #32
+1111: ld1 {v19.4s-v20.4s}, [\rk], #32
+2222: ld1 {v21.4s-v24.4s}, [\rk], #64
+ ld1 {v25.4s-v28.4s}, [\rk], #64
+ ld1 {v29.4s-v31.4s}, [\rk]
.endm

/* prepare for encryption with key in rk[] */
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:15 UTC
Of the various chaining modes implemented by the bit-sliced AES driver,
only CTR is exposed as a synchronous cipher, and it requires a fallback in
order to remain usable once we update the kernel mode NEON handling logic
to disallow nested use. So wire up the existing CTR fallback C code.

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/Kconfig | 1 +
arch/arm64/crypto/aes-neonbs-glue.c | 48 ++++++++++++++++++--
2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index a068dcbe2518..f9e264b83366 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -89,6 +89,7 @@ config CRYPTO_AES_ARM64_BS
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64_NEON_BLK
+ select CRYPTO_AES_ARM64
select CRYPTO_SIMD

endif
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
index 9001aec16007..c55d68ccb89f 100644
--- a/arch/arm64/crypto/aes-neonbs-glue.c
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -1,7 +1,7 @@
/*
* Bit sliced AES using NEON instructions
*
- * Copyright (C) 2016 Linaro Ltd <***@linaro.org>
+ * Copyright (C) 2016 - 2017 Linaro Ltd <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -9,12 +9,15 @@
*/

#include <asm/neon.h>
+#include <asm/simd.h>
#include <crypto/aes.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/xts.h>
#include <linux/module.h>

+#include "aes-ctr-fallback.h"
+
MODULE_AUTHOR("Ard Biesheuvel <***@linaro.org>");
MODULE_LICENSE("GPL v2");

@@ -58,6 +61,11 @@ struct aesbs_cbc_ctx {
u32 enc[AES_MAX_KEYLENGTH_U32];
};

+struct aesbs_ctr_ctx {
+ struct aesbs_ctx key; /* must be first member */
+ struct crypto_aes_ctx fallback;
+};
+
struct aesbs_xts_ctx {
struct aesbs_ctx key;
u32 twkey[AES_MAX_KEYLENGTH_U32];
@@ -196,6 +204,25 @@ static int cbc_decrypt(struct skcipher_request *req)
return err;
}

+static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err;
+
+ err = crypto_aes_expand_key(&ctx->fallback, in_key, key_len);
+ if (err)
+ return err;
+
+ ctx->key.rounds = 6 + key_len / 4;
+
+ kernel_neon_begin();
+ aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds);
+ kernel_neon_end();
+
+ return 0;
+}
+
static int ctr_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -259,6 +286,17 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
return aesbs_setkey(tfm, in_key, key_len);
}

+static int ctr_encrypt_sync(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ if (!may_use_simd())
+ return aes_ctr_encrypt_fallback(&ctx->fallback, req);
+
+ return ctr_encrypt(req);
+}
+
static int __xts_crypt(struct skcipher_request *req,
void (*fn)(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]))
@@ -355,7 +393,7 @@ static struct skcipher_alg aes_algs[] = { {
.base.cra_driver_name = "ctr-aes-neonbs",
.base.cra_priority = 250 - 1,
.base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct aesbs_ctx),
+ .base.cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
.base.cra_module = THIS_MODULE,

.min_keysize = AES_MIN_KEY_SIZE,
@@ -363,9 +401,9 @@ static struct skcipher_alg aes_algs[] = { {
.chunksize = AES_BLOCK_SIZE,
.walksize = 8 * AES_BLOCK_SIZE,
.ivsize = AES_BLOCK_SIZE,
- .setkey = aesbs_setkey,
- .encrypt = ctr_encrypt,
- .decrypt = ctr_encrypt,
+ .setkey = aesbs_ctr_setkey_sync,
+ .encrypt = ctr_encrypt_sync,
+ .decrypt = ctr_encrypt_sync,
}, {
.base.cra_name = "__xts(aes)",
.base.cra_driver_name = "__xts-aes-neonbs",
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:12 UTC
The arm64 kernel will shortly disallow nested kernel mode NEON.

So honour this in the ARMv8 Crypto Extensions implementation of
CCM-AES, and fall back to a scalar implementation using the generic
crypto helpers for AES, XOR and incrementing the CTR counter.
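
Per block, the scalar path amounts to the following (a sketch of the
encrypt direction; rounds stands for num_rounds(ctx), and the helpers are
the ones used in the patch below):

    crypto_inc(walk->iv, AES_BLOCK_SIZE);                      /* bump CTR */
    __aes_arm64_encrypt(ctx->key_enc, buf, walk->iv, rounds);  /* keystream */
    __aes_arm64_encrypt(ctx->key_enc, mac, mac, rounds);       /* CBC-MAC */
    crypto_xor(mac, src, bsize);            /* authenticate the plaintext */
    crypto_xor_cpy(dst, src, buf, bsize);   /* encrypt: dst = src ^ keystream */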

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/Kconfig | 1 +
arch/arm64/crypto/aes-ce-ccm-glue.c | 174 ++++++++++++++++----
2 files changed, 140 insertions(+), 35 deletions(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 2fd4bb6d0b5a..ba637765c19a 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -59,6 +59,7 @@ config CRYPTO_AES_ARM64_CE_CCM
depends on ARM64 && KERNEL_MODE_NEON
select CRYPTO_ALGAPI
select CRYPTO_AES_ARM64_CE
+ select CRYPTO_AES_ARM64
select CRYPTO_AEAD

config CRYPTO_AES_ARM64_CE_BLK
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index 6a7dbc7c83a6..a1254036f2b1 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -1,7 +1,7 @@
/*
* aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions
*
- * Copyright (C) 2013 - 2014 Linaro Ltd <***@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -9,6 +9,7 @@
*/

#include <asm/neon.h>
+#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/aes.h>
#include <crypto/scatterwalk.h>
@@ -44,6 +45,8 @@ asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[],
u32 rounds);

+asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+
static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key,
unsigned int key_len)
{
@@ -103,7 +106,45 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
return 0;
}

-static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
+static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
+ u32 abytes, u32 *macp, bool use_neon)
+{
+ if (likely(use_neon)) {
+ ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
+ num_rounds(key));
+ } else {
+ if (*macp > 0 && *macp < AES_BLOCK_SIZE) {
+ int added = min(abytes, AES_BLOCK_SIZE - *macp);
+
+ crypto_xor(&mac[*macp], in, added);
+
+ *macp += added;
+ in += added;
+ abytes -= added;
+ }
+
+ while (abytes > AES_BLOCK_SIZE) {
+ __aes_arm64_encrypt(key->key_enc, mac, mac,
+ num_rounds(key));
+ crypto_xor(mac, in, AES_BLOCK_SIZE);
+
+ in += AES_BLOCK_SIZE;
+ abytes -= AES_BLOCK_SIZE;
+ }
+
+ if (abytes > 0) {
+ __aes_arm64_encrypt(key->key_enc, mac, mac,
+ num_rounds(key));
+ crypto_xor(mac, in, abytes);
+ *macp = abytes;
+ } else {
+ *macp = 0;
+ }
+ }
+}
+
+static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[],
+ bool use_neon)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
@@ -122,8 +163,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
ltag.len = 6;
}

- ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc,
- num_rounds(ctx));
+ ccm_update_mac(ctx, mac, (u8 *)&ltag, ltag.len, &macp, use_neon);
scatterwalk_start(&walk, req->src);

do {
@@ -135,8 +175,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
n = scatterwalk_clamp(&walk, len);
}
p = scatterwalk_map(&walk);
- ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc,
- num_rounds(ctx));
+ ccm_update_mac(ctx, mac, p, n, &macp, use_neon);
len -= n;

scatterwalk_unmap(p);
@@ -145,6 +184,56 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
} while (len);
}

+static int ccm_crypt_fallback(struct skcipher_walk *walk, u8 mac[], u8 iv0[],
+ struct crypto_aes_ctx *ctx, bool enc)
+{
+ u8 buf[AES_BLOCK_SIZE];
+ int err = 0;
+
+ while (walk->nbytes) {
+ int blocks = walk->nbytes / AES_BLOCK_SIZE;
+ u32 tail = walk->nbytes % AES_BLOCK_SIZE;
+ u8 *dst = walk->dst.virt.addr;
+ u8 *src = walk->src.virt.addr;
+ u32 nbytes = walk->nbytes;
+
+ if (nbytes == walk->total && tail > 0) {
+ blocks++;
+ tail = 0;
+ }
+
+ do {
+ u32 bsize = AES_BLOCK_SIZE;
+
+ if (nbytes < AES_BLOCK_SIZE)
+ bsize = nbytes;
+
+ crypto_inc(walk->iv, AES_BLOCK_SIZE);
+ __aes_arm64_encrypt(ctx->key_enc, buf, walk->iv,
+ num_rounds(ctx));
+ __aes_arm64_encrypt(ctx->key_enc, mac, mac,
+ num_rounds(ctx));
+ if (enc)
+ crypto_xor(mac, src, bsize);
+ crypto_xor_cpy(dst, src, buf, bsize);
+ if (!enc)
+ crypto_xor(mac, dst, bsize);
+ dst += bsize;
+ src += bsize;
+ nbytes -= bsize;
+ } while (--blocks);
+
+ err = skcipher_walk_done(walk, tail);
+ }
+
+ if (!err) {
+ __aes_arm64_encrypt(ctx->key_enc, buf, iv0, num_rounds(ctx));
+ __aes_arm64_encrypt(ctx->key_enc, mac, mac, num_rounds(ctx));
+ crypto_xor(mac, buf, AES_BLOCK_SIZE);
+ }
+ return err;
+}
+
static int ccm_encrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
@@ -153,39 +242,46 @@ static int ccm_encrypt(struct aead_request *req)
u8 __aligned(8) mac[AES_BLOCK_SIZE];
u8 buf[AES_BLOCK_SIZE];
u32 len = req->cryptlen;
+ bool use_neon = may_use_simd();
int err;

err = ccm_init_mac(req, mac, len);
if (err)
return err;

- kernel_neon_begin_partial(6);
+ if (likely(use_neon))
+ kernel_neon_begin();

if (req->assoclen)
- ccm_calculate_auth_mac(req, mac);
+ ccm_calculate_auth_mac(req, mac, use_neon);

/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);

err = skcipher_walk_aead_encrypt(&walk, req, true);

- while (walk.nbytes) {
- u32 tail = walk.nbytes % AES_BLOCK_SIZE;
-
- if (walk.nbytes == walk.total)
- tail = 0;
+ if (likely(use_neon)) {
+ while (walk.nbytes) {
+ u32 tail = walk.nbytes % AES_BLOCK_SIZE;

- ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes - tail, ctx->key_enc,
- num_rounds(ctx), mac, walk.iv);
+ if (walk.nbytes == walk.total)
+ tail = 0;

- err = skcipher_walk_done(&walk, tail);
- }
- if (!err)
- ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
+ ce_aes_ccm_encrypt(walk.dst.virt.addr,
+ walk.src.virt.addr,
+ walk.nbytes - tail, ctx->key_enc,
+ num_rounds(ctx), mac, walk.iv);

- kernel_neon_end();
+ err = skcipher_walk_done(&walk, tail);
+ }
+ if (!err)
+ ce_aes_ccm_final(mac, buf, ctx->key_enc,
+ num_rounds(ctx));

+ kernel_neon_end();
+ } else {
+ err = ccm_crypt_fallback(&walk, mac, buf, ctx, true);
+ }
if (err)
return err;

@@ -205,38 +301,46 @@ static int ccm_decrypt(struct aead_request *req)
u8 __aligned(8) mac[AES_BLOCK_SIZE];
u8 buf[AES_BLOCK_SIZE];
u32 len = req->cryptlen - authsize;
+ bool use_neon = may_use_simd();
int err;

err = ccm_init_mac(req, mac, len);
if (err)
return err;

- kernel_neon_begin_partial(6);
+ if (likely(use_neon))
+ kernel_neon_begin();

if (req->assoclen)
- ccm_calculate_auth_mac(req, mac);
+ ccm_calculate_auth_mac(req, mac, use_neon);

/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);

err = skcipher_walk_aead_decrypt(&walk, req, true);

- while (walk.nbytes) {
- u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ if (likely(use_neon)) {
+ while (walk.nbytes) {
+ u32 tail = walk.nbytes % AES_BLOCK_SIZE;

- if (walk.nbytes == walk.total)
- tail = 0;
+ if (walk.nbytes == walk.total)
+ tail = 0;

- ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes - tail, ctx->key_enc,
- num_rounds(ctx), mac, walk.iv);
+ ce_aes_ccm_decrypt(walk.dst.virt.addr,
+ walk.src.virt.addr,
+ walk.nbytes - tail, ctx->key_enc,
+ num_rounds(ctx), mac, walk.iv);

- err = skcipher_walk_done(&walk, tail);
- }
- if (!err)
- ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
+ err = skcipher_walk_done(&walk, tail);
+ }
+ if (!err)
+ ce_aes_ccm_final(mac, buf, ctx->key_enc,
+ num_rounds(ctx));

- kernel_neon_end();
+ kernel_neon_end();
+ } else {
+ err = ccm_crypt_fallback(&walk, mac, buf, ctx, false);
+ }

if (err)
return err;
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:13 UTC
To accommodate systems that may disallow use of the NEON in kernel mode
in some circumstances, introduce a C fallback for synchronous AES in CTR
mode, and use it if may_use_simd() returns false.

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/Kconfig | 6 +-
arch/arm64/crypto/aes-ctr-fallback.h | 53 ++++++++++++++++++
arch/arm64/crypto/aes-glue.c | 59 +++++++++++++++-----
3 files changed, 101 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index ba637765c19a..a068dcbe2518 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -64,15 +64,17 @@ config CRYPTO_AES_ARM64_CE_CCM

config CRYPTO_AES_ARM64_CE_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
- depends on ARM64 && KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64_CE
+ select CRYPTO_AES_ARM64
select CRYPTO_SIMD

config CRYPTO_AES_ARM64_NEON_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
- depends on ARM64 && KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
+ select CRYPTO_AES_ARM64
select CRYPTO_AES
select CRYPTO_SIMD

diff --git a/arch/arm64/crypto/aes-ctr-fallback.h b/arch/arm64/crypto/aes-ctr-fallback.h
new file mode 100644
index 000000000000..c9285717b6b5
--- /dev/null
+++ b/arch/arm64/crypto/aes-ctr-fallback.h
@@ -0,0 +1,53 @@
+/*
+ * Fallback for sync aes(ctr) in contexts where kernel mode NEON
+ * is not allowed
+ *
+ * Copyright (C) 2017 Linaro Ltd <***@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <crypto/internal/skcipher.h>
+
+asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+
+static inline int aes_ctr_encrypt_fallback(struct crypto_aes_ctx *ctx,
+ struct skcipher_request *req)
+{
+ struct skcipher_walk walk;
+ u8 buf[AES_BLOCK_SIZE];
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ while (walk.nbytes > 0) {
+ u8 *dst = walk.dst.virt.addr;
+ u8 *src = walk.src.virt.addr;
+ int nbytes = walk.nbytes;
+ int tail = 0;
+
+ if (nbytes < walk.total) {
+ nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+ tail = walk.nbytes % AES_BLOCK_SIZE;
+ }
+
+ do {
+ int bsize = min(nbytes, AES_BLOCK_SIZE);
+
+ __aes_arm64_encrypt(ctx->key_enc, buf, walk.iv,
+ 6 + ctx->key_length / 4);
+ crypto_xor_cpy(dst, src, buf, bsize);
+ crypto_inc(walk.iv, AES_BLOCK_SIZE);
+
+ dst += AES_BLOCK_SIZE;
+ src += AES_BLOCK_SIZE;
+ nbytes -= AES_BLOCK_SIZE;
+ } while (nbytes > 0);
+
+ err = skcipher_walk_done(&walk, tail);
+ }
+ return err;
+}
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index 0da30e3b0e4b..998ba519a026 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -10,6 +10,7 @@

#include <asm/neon.h>
#include <asm/hwcap.h>
+#include <asm/simd.h>
#include <crypto/aes.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
@@ -19,6 +20,7 @@
#include <crypto/xts.h>

#include "aes-ce-setkey.h"
+#include "aes-ctr-fallback.h"

#ifdef USE_V8_CRYPTO_EXTENSIONS
#define MODE "ce"
@@ -249,6 +251,17 @@ static int ctr_encrypt(struct skcipher_request *req)
return err;
}

+static int ctr_encrypt_sync(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ if (!may_use_simd())
+ return aes_ctr_encrypt_fallback(ctx, req);
+
+ return ctr_encrypt(req);
+}
+
static int xts_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -355,8 +368,8 @@ static struct skcipher_alg aes_algs[] = { {
.ivsize = AES_BLOCK_SIZE,
.chunksize = AES_BLOCK_SIZE,
.setkey = skcipher_aes_setkey,
- .encrypt = ctr_encrypt,
- .decrypt = ctr_encrypt,
+ .encrypt = ctr_encrypt_sync,
+ .decrypt = ctr_encrypt_sync,
}, {
.base = {
.cra_name = "__xts(aes)",
@@ -458,11 +471,35 @@ static int mac_init(struct shash_desc *desc)
return 0;
}

+static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
+ u8 dg[], int enc_before, int enc_after)
+{
+ int rounds = 6 + ctx->key_length / 4;
+
+ if (may_use_simd()) {
+ kernel_neon_begin();
+ aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before,
+ enc_after);
+ kernel_neon_end();
+ } else {
+ if (enc_before)
+ __aes_arm64_encrypt(ctx->key_enc, dg, dg, rounds);
+
+ while (blocks--) {
+ crypto_xor(dg, in, AES_BLOCK_SIZE);
+ in += AES_BLOCK_SIZE;
+
+ if (blocks || enc_after)
+ __aes_arm64_encrypt(ctx->key_enc, dg, dg,
+ rounds);
+ }
+ }
+}
+
static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len)
{
struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
- int rounds = 6 + tctx->key.key_length / 4;

while (len > 0) {
unsigned int l;
@@ -474,10 +511,8 @@ static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len)

len %= AES_BLOCK_SIZE;

- kernel_neon_begin();
- aes_mac_update(p, tctx->key.key_enc, rounds, blocks,
- ctx->dg, (ctx->len != 0), (len != 0));
- kernel_neon_end();
+ mac_do_update(&tctx->key, p, blocks, ctx->dg,
+ (ctx->len != 0), (len != 0));

p += blocks * AES_BLOCK_SIZE;

@@ -505,11 +540,8 @@ static int cbcmac_final(struct shash_desc *desc, u8 *out)
{
struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
- int rounds = 6 + tctx->key.key_length / 4;

- kernel_neon_begin();
- aes_mac_update(NULL, tctx->key.key_enc, rounds, 0, ctx->dg, 1, 0);
- kernel_neon_end();
+ mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1, 0);

memcpy(out, ctx->dg, AES_BLOCK_SIZE);

@@ -520,7 +552,6 @@ static int cmac_final(struct shash_desc *desc, u8 *out)
{
struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
- int rounds = 6 + tctx->key.key_length / 4;
u8 *consts = tctx->consts;

if (ctx->len != AES_BLOCK_SIZE) {
@@ -528,9 +559,7 @@ static int cmac_final(struct shash_desc *desc, u8 *out)
consts += AES_BLOCK_SIZE;
}

- kernel_neon_begin();
- aes_mac_update(consts, tctx->key.key_enc, rounds, 1, ctx->dg, 0, 1);
- kernel_neon_end();
+ mac_do_update(&tctx->key, consts, 1, ctx->dg, 0, 1);

memcpy(out, ctx->dg, AES_BLOCK_SIZE);
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:16 UTC
Permalink
Currently, the AES-GCM implementation for arm64 systems that support the
ARMv8 Crypto Extensions is based on the generic GCM module, which combines
the AES-CTR implementation using AES instructions with the PMULL based
GHASH driver. This is suboptimal, given that the input data needs
to be loaded twice, once for the encryption and again for the MAC
calculation.

On Cortex-A57 (r1p2) and other recent cores that implement micro-op fusing
for the AES instructions, AES executes at less than 1 cycle per byte, which
means that any cycles wasted on loading the data twice hurt even more.

So implement a new GCM driver that combines the AES and PMULL instructions
at the block level. This improves performance on Cortex-A57 by ~35% (from
3.5 to 2.6 cycles per byte).
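
As a C-level illustration (not part of the patch), the fused flow per
16 byte block looks roughly like the sketch below, where
aes_encrypt_block(), ctr_increment() and ghash_mul_add() are
hypothetical stand-ins for the actual AES/PMULL assembler routines:

struct gcm_sketch {
	u32 *rk;	/* expanded AES round keys */
	u8 h[16];	/* hash subkey H = E(K, 0) */
	u8 dg[16];	/* running GHASH digest */
	u8 ctr[16];	/* counter block */
};

static void gcm_encrypt_block_fused(struct gcm_sketch *s, u8 *dst,
				    const u8 *src)
{
	u8 ks[16];

	aes_encrypt_block(s->rk, ks, s->ctr);	/* E(K, counter) */
	ctr_increment(s->ctr);

	crypto_xor_cpy(dst, src, ks, 16);	/* emit ciphertext */
	ghash_mul_add(s->dg, s->h, dst);	/* fold into GHASH */
}

Each block is thus loaded only once and consumed by both the cipher
and the MAC while it is still live in registers, which is where the
speedup comes from.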

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/Kconfig | 4 +-
arch/arm64/crypto/ghash-ce-core.S | 175 ++++++++
arch/arm64/crypto/ghash-ce-glue.c | 438 ++++++++++++++++++--
3 files changed, 591 insertions(+), 26 deletions(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index f9e264b83366..7ca54a76f6b9 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -29,10 +29,12 @@ config CRYPTO_SHA2_ARM64_CE
select CRYPTO_SHA256_ARM64

config CRYPTO_GHASH_ARM64_CE
- tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
+ tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_GF128MUL
+ select CRYPTO_AES
+ select CRYPTO_AES_ARM64

config CRYPTO_CRCT10DIF_ARM64_CE
tristate "CRCT10DIF digest algorithm using PMULL instructions"
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index f0bb9f0b524f..cb22459eba85 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -77,3 +77,178 @@ CPU_LE( rev64 T1.16b, T1.16b )
st1 {XL.2d}, [x1]
ret
ENDPROC(pmull_ghash_update)
+
+ KS .req v8
+ CTR .req v9
+ INP .req v10
+
+ .macro load_round_keys, rounds, rk
+ cmp \rounds, #12
+ blo 2222f /* 128 bits */
+ beq 1111f /* 192 bits */
+ ld1 {v17.4s-v18.4s}, [\rk], #32
+1111: ld1 {v19.4s-v20.4s}, [\rk], #32
+2222: ld1 {v21.4s-v24.4s}, [\rk], #64
+ ld1 {v25.4s-v28.4s}, [\rk], #64
+ ld1 {v29.4s-v31.4s}, [\rk]
+ .endm
+
+ .macro enc_round, state, key
+ aese \state\().16b, \key\().16b
+ aesmc \state\().16b, \state\().16b
+ .endm
+
+ .macro enc_block, state, rounds
+ cmp \rounds, #12
+ b.lo 2222f /* 128 bits */
+ b.eq 1111f /* 192 bits */
+ enc_round \state, v17
+ enc_round \state, v18
+1111: enc_round \state, v19
+ enc_round \state, v20
+2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ enc_round \state, \key
+ .endr
+ aese \state\().16b, v30.16b
+ eor \state\().16b, \state\().16b, v31.16b
+ .endm
+
+ .macro pmull_gcm_do_crypt, enc
+ ld1 {SHASH.2d}, [x4]
+ ld1 {XL.2d}, [x1]
+ ldr x8, [x5, #8] // load lower counter
+
+ movi MASK.16b, #0xe1
+ ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
+CPU_LE( rev x8, x8 )
+ shl MASK.2d, MASK.2d, #57
+ eor SHASH2.16b, SHASH2.16b, SHASH.16b
+
+ .if \enc == 1
+ ld1 {KS.16b}, [x7]
+ .endif
+
+0: ld1 {CTR.8b}, [x5] // load upper counter
+ ld1 {INP.16b}, [x3], #16
+ rev x9, x8
+ add x8, x8, #1
+ sub w0, w0, #1
+ ins CTR.d[1], x9 // set lower counter
+
+ .if \enc == 1
+ eor INP.16b, INP.16b, KS.16b // encrypt input
+ st1 {INP.16b}, [x2], #16
+ .endif
+
+ rev64 T1.16b, INP.16b
+
+ cmp w6, #12
+ b.ge 2f // AES-192/256?
+
+1: enc_round CTR, v21
+
+ ext T2.16b, XL.16b, XL.16b, #8
+ ext IN1.16b, T1.16b, T1.16b, #8
+
+ enc_round CTR, v22
+
+ eor T1.16b, T1.16b, T2.16b
+ eor XL.16b, XL.16b, IN1.16b
+
+ enc_round CTR, v23
+
+ pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
+ eor T1.16b, T1.16b, XL.16b
+
+ enc_round CTR, v24
+
+ pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
+ pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
+
+ enc_round CTR, v25
+
+ ext T1.16b, XL.16b, XH.16b, #8
+ eor T2.16b, XL.16b, XH.16b
+ eor XM.16b, XM.16b, T1.16b
+
+ enc_round CTR, v26
+
+ eor XM.16b, XM.16b, T2.16b
+ pmull T2.1q, XL.1d, MASK.1d
+
+ enc_round CTR, v27
+
+ mov XH.d[0], XM.d[1]
+ mov XM.d[1], XL.d[0]
+
+ enc_round CTR, v28
+
+ eor XL.16b, XM.16b, T2.16b
+
+ enc_round CTR, v29
+
+ ext T2.16b, XL.16b, XL.16b, #8
+
+ aese CTR.16b, v30.16b
+
+ pmull XL.1q, XL.1d, MASK.1d
+ eor T2.16b, T2.16b, XH.16b
+
+ eor KS.16b, CTR.16b, v31.16b
+
+ eor XL.16b, XL.16b, T2.16b
+
+ .if \enc == 0
+ eor INP.16b, INP.16b, KS.16b
+ st1 {INP.16b}, [x2], #16
+ .endif
+
+ cbnz w0, 0b
+
+CPU_LE( rev x8, x8 )
+ st1 {XL.2d}, [x1]
+ str x8, [x5, #8] // store lower counter
+
+ .if \enc == 1
+ st1 {KS.16b}, [x7]
+ .endif
+
+ ret
+
+2: b.eq 3f // AES-192?
+ enc_round CTR, v17
+ enc_round CTR, v18
+3: enc_round CTR, v19
+ enc_round CTR, v20
+ b 1b
+ .endm
+
+ /*
+ * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
+ * struct ghash_key const *k, u8 ctr[],
+ * int rounds, u8 ks[])
+ */
+ENTRY(pmull_gcm_encrypt)
+ pmull_gcm_do_crypt 1
+ENDPROC(pmull_gcm_encrypt)
+
+ /*
+ * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
+ * struct ghash_key const *k, u8 ctr[],
+ * int rounds)
+ */
+ENTRY(pmull_gcm_decrypt)
+ pmull_gcm_do_crypt 0
+ENDPROC(pmull_gcm_decrypt)
+
+ /*
+ * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
+ */
+ENTRY(pmull_gcm_encrypt_block)
+ cbz x2, 0f
+ load_round_keys w3, x2
+0: ld1 {v0.16b}, [x1]
+ enc_block v0, w3
+ st1 {v0.16b}, [x0]
+ ret
+ENDPROC(pmull_gcm_encrypt_block)
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 30221ef56e70..ee6aaac05905 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -11,18 +11,25 @@
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/b128ops.h>
#include <crypto/gf128mul.h>
+#include <crypto/internal/aead.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>

-MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions");
+MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <***@linaro.org>");
MODULE_LICENSE("GPL v2");

#define GHASH_BLOCK_SIZE 16
#define GHASH_DIGEST_SIZE 16
+#define GCM_IV_SIZE 12

struct ghash_key {
u64 a;
@@ -36,9 +43,27 @@ struct ghash_desc_ctx {
u32 count;
};

+struct gcm_aes_ctx {
+ struct crypto_aes_ctx aes_key;
+ struct ghash_key ghash_key;
+};
+
asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
struct ghash_key const *k, const char *head);

+asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
+ const u8 src[], struct ghash_key const *k,
+ u8 ctr[], int rounds, u8 ks[]);
+
+asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[],
+ const u8 src[], struct ghash_key const *k,
+ u8 ctr[], int rounds);
+
+asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[],
+ u32 const rk[], int rounds);
+
+asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+
static int ghash_init(struct shash_desc *desc)
{
struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
@@ -130,17 +155,11 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
return 0;
}

-static int ghash_setkey(struct crypto_shash *tfm,
- const u8 *inkey, unsigned int keylen)
+static int __ghash_setkey(struct ghash_key *key,
+ const u8 *inkey, unsigned int keylen)
{
- struct ghash_key *key = crypto_shash_ctx(tfm);
u64 a, b;

- if (keylen != GHASH_BLOCK_SIZE) {
- crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
-
/* needed for the fallback */
memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);

@@ -157,32 +176,401 @@ static int ghash_setkey(struct crypto_shash *tfm,
return 0;
}

+static int ghash_setkey(struct crypto_shash *tfm,
+ const u8 *inkey, unsigned int keylen)
+{
+ struct ghash_key *key = crypto_shash_ctx(tfm);
+
+ if (keylen != GHASH_BLOCK_SIZE) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ return __ghash_setkey(key, inkey, keylen);
+}
+
static struct shash_alg ghash_alg = {
- .digestsize = GHASH_DIGEST_SIZE,
- .init = ghash_init,
- .update = ghash_update,
- .final = ghash_final,
- .setkey = ghash_setkey,
- .descsize = sizeof(struct ghash_desc_ctx),
- .base = {
- .cra_name = "ghash",
- .cra_driver_name = "ghash-ce",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_SHASH,
- .cra_blocksize = GHASH_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct ghash_key),
- .cra_module = THIS_MODULE,
- },
+ .base.cra_name = "ghash",
+ .base.cra_driver_name = "ghash-ce",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .base.cra_blocksize = GHASH_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct ghash_key),
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = GHASH_DIGEST_SIZE,
+ .init = ghash_init,
+ .update = ghash_update,
+ .final = ghash_final,
+ .setkey = ghash_setkey,
+ .descsize = sizeof(struct ghash_desc_ctx),
+};
+
+static int num_rounds(struct crypto_aes_ctx *ctx)
+{
+ /*
+ * # of rounds specified by AES:
+ * 128 bit key 10 rounds
+ * 192 bit key 12 rounds
+ * 256 bit key 14 rounds
+ * => n byte key => 6 + (n/4) rounds
+ */
+ return 6 + ctx->key_length / 4;
+}
+
+static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
+ unsigned int keylen)
+{
+ struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm);
+ u8 key[GHASH_BLOCK_SIZE];
+ int ret;
+
+ ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen);
+ if (ret) {
+ tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ __aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){},
+ num_rounds(&ctx->aes_key));
+
+ return __ghash_setkey(&ctx->ghash_key, key, sizeof(key));
+}
+
+static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+ switch (authsize) {
+ case 4:
+ case 8:
+ case 12 ... 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
+ int *buf_count, struct gcm_aes_ctx *ctx)
+{
+ if (*buf_count > 0) {
+ int buf_added = min(count, GHASH_BLOCK_SIZE - *buf_count);
+
+ memcpy(&buf[*buf_count], src, buf_added);
+
+ *buf_count += buf_added;
+ src += buf_added;
+ count -= buf_added;
+ }
+
+ if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) {
+ int blocks = count / GHASH_BLOCK_SIZE;
+
+ ghash_do_update(blocks, dg, src, &ctx->ghash_key,
+ *buf_count ? buf : NULL);
+
+ src += blocks * GHASH_BLOCK_SIZE;
+ count %= GHASH_BLOCK_SIZE;
+ *buf_count = 0;
+ }
+
+ if (count > 0) {
+ memcpy(buf, src, count);
+ *buf_count = count;
+ }
+}
+
+static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
+ u8 buf[GHASH_BLOCK_SIZE];
+ struct scatter_walk walk;
+ u32 len = req->assoclen;
+ int buf_count = 0;
+
+ scatterwalk_start(&walk, req->src);
+
+ do {
+ u32 n = scatterwalk_clamp(&walk, len);
+ u8 *p;
+
+ if (!n) {
+ scatterwalk_start(&walk, sg_next(walk.sg));
+ n = scatterwalk_clamp(&walk, len);
+ }
+ p = scatterwalk_map(&walk);
+
+ gcm_update_mac(dg, p, n, buf, &buf_count, ctx);
+ len -= n;
+
+ scatterwalk_unmap(p);
+ scatterwalk_advance(&walk, n);
+ scatterwalk_done(&walk, 0, len);
+ } while (len);
+
+ if (buf_count) {
+ memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count);
+ ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+ }
+}
+
+static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx,
+ u64 dg[], u8 tag[], int cryptlen)
+{
+ u8 mac[AES_BLOCK_SIZE];
+ u128 lengths;
+
+ lengths.a = cpu_to_be64(req->assoclen * 8);
+ lengths.b = cpu_to_be64(cryptlen * 8);
+
+ ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL);
+
+ put_unaligned_be64(dg[1], mac);
+ put_unaligned_be64(dg[0], mac + 8);
+
+ crypto_xor(tag, mac, AES_BLOCK_SIZE);
+}
+
+static int gcm_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
+ struct skcipher_walk walk;
+ u8 iv[AES_BLOCK_SIZE];
+ u8 ks[AES_BLOCK_SIZE];
+ u8 tag[AES_BLOCK_SIZE];
+ u64 dg[2] = {};
+ int err;
+
+ if (req->assoclen)
+ gcm_calculate_auth_mac(req, dg);
+
+ memcpy(iv, req->iv, GCM_IV_SIZE);
+ put_unaligned_be32(1, iv + GCM_IV_SIZE);
+
+ if (likely(may_use_simd())) {
+ kernel_neon_begin();
+
+ pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
+ num_rounds(&ctx->aes_key));
+ put_unaligned_be32(2, iv + GCM_IV_SIZE);
+ pmull_gcm_encrypt_block(ks, iv, NULL,
+ num_rounds(&ctx->aes_key));
+ put_unaligned_be32(3, iv + GCM_IV_SIZE);
+
+ err = skcipher_walk_aead_encrypt(&walk, req, true);
+
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+ pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
+ walk.src.virt.addr, &ctx->ghash_key,
+ iv, num_rounds(&ctx->aes_key), ks);
+
+ err = skcipher_walk_done(&walk,
+ walk.nbytes % AES_BLOCK_SIZE);
+ }
+ kernel_neon_end();
+ } else {
+ __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv,
+ num_rounds(&ctx->aes_key));
+ put_unaligned_be32(2, iv + GCM_IV_SIZE);
+
+ err = skcipher_walk_aead_encrypt(&walk, req, true);
+
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ u8 *dst = walk.dst.virt.addr;
+ u8 *src = walk.src.virt.addr;
+
+ do {
+ __aes_arm64_encrypt(ctx->aes_key.key_enc,
+ ks, iv,
+ num_rounds(&ctx->aes_key));
+ crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE);
+ crypto_inc(iv, AES_BLOCK_SIZE);
+
+ dst += AES_BLOCK_SIZE;
+ src += AES_BLOCK_SIZE;
+ } while (--blocks > 0);
+
+ ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg,
+ walk.dst.virt.addr, &ctx->ghash_key,
+ NULL);
+
+ err = skcipher_walk_done(&walk,
+ walk.nbytes % AES_BLOCK_SIZE);
+ }
+ if (walk.nbytes)
+ __aes_arm64_encrypt(ctx->aes_key.key_enc, ks, iv,
+ num_rounds(&ctx->aes_key));
+ }
+
+ /* handle the tail */
+ if (walk.nbytes) {
+ u8 buf[GHASH_BLOCK_SIZE];
+
+ crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks,
+ walk.nbytes);
+
+ memcpy(buf, walk.dst.virt.addr, walk.nbytes);
+ memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
+ ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+
+ err = skcipher_walk_done(&walk, 0);
+ }
+
+ if (err)
+ return err;
+
+ gcm_final(req, ctx, dg, tag, req->cryptlen);
+
+ /* copy authtag to end of dst */
+ scatterwalk_map_and_copy(tag, req->dst, req->assoclen + req->cryptlen,
+ crypto_aead_authsize(aead), 1);
+
+ return 0;
+}
+
+static int gcm_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
+ unsigned int authsize = crypto_aead_authsize(aead);
+ struct skcipher_walk walk;
+ u8 iv[AES_BLOCK_SIZE];
+ u8 tag[AES_BLOCK_SIZE];
+ u8 buf[GHASH_BLOCK_SIZE];
+ u64 dg[2] = {};
+ int err;
+
+ if (req->assoclen)
+ gcm_calculate_auth_mac(req, dg);
+
+ memcpy(iv, req->iv, GCM_IV_SIZE);
+ put_unaligned_be32(1, iv + GCM_IV_SIZE);
+
+ if (likely(may_use_simd())) {
+ kernel_neon_begin();
+
+ pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
+ num_rounds(&ctx->aes_key));
+ put_unaligned_be32(2, iv + GCM_IV_SIZE);
+
+ err = skcipher_walk_aead_decrypt(&walk, req, true);
+
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+ pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr,
+ walk.src.virt.addr, &ctx->ghash_key,
+ iv, num_rounds(&ctx->aes_key));
+
+ err = skcipher_walk_done(&walk,
+ walk.nbytes % AES_BLOCK_SIZE);
+ }
+ if (walk.nbytes)
+ pmull_gcm_encrypt_block(iv, iv, NULL,
+ num_rounds(&ctx->aes_key));
+
+ kernel_neon_end();
+ } else {
+ __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv,
+ num_rounds(&ctx->aes_key));
+ put_unaligned_be32(2, iv + GCM_IV_SIZE);
+
+ err = skcipher_walk_aead_decrypt(&walk, req, true);
+
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ u8 *dst = walk.dst.virt.addr;
+ u8 *src = walk.src.virt.addr;
+
+ ghash_do_update(blocks, dg, walk.src.virt.addr,
+ &ctx->ghash_key, NULL);
+
+ do {
+ __aes_arm64_encrypt(ctx->aes_key.key_enc,
+ buf, iv,
+ num_rounds(&ctx->aes_key));
+ crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
+ crypto_inc(iv, AES_BLOCK_SIZE);
+
+ dst += AES_BLOCK_SIZE;
+ src += AES_BLOCK_SIZE;
+ } while (--blocks > 0);
+
+ err = skcipher_walk_done(&walk,
+ walk.nbytes % AES_BLOCK_SIZE);
+ }
+ if (walk.nbytes)
+ __aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv,
+ num_rounds(&ctx->aes_key));
+ }
+
+ /* handle the tail */
+ if (walk.nbytes) {
+ memcpy(buf, walk.src.virt.addr, walk.nbytes);
+ memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
+ ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+
+ crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv,
+ walk.nbytes);
+
+ err = skcipher_walk_done(&walk, 0);
+ }
+
+ if (err)
+ return err;
+
+ gcm_final(req, ctx, dg, tag, req->cryptlen - authsize);
+
+ /* compare calculated auth tag with the stored one */
+ scatterwalk_map_and_copy(buf, req->src,
+ req->assoclen + req->cryptlen - authsize,
+ authsize, 0);
+
+ if (crypto_memneq(tag, buf, authsize))
+ return -EBADMSG;
+ return 0;
+}
+
+static struct aead_alg gcm_aes_alg = {
+ .ivsize = GCM_IV_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .maxauthsize = AES_BLOCK_SIZE,
+ .setkey = gcm_setkey,
+ .setauthsize = gcm_setauthsize,
+ .encrypt = gcm_encrypt,
+ .decrypt = gcm_decrypt,
+
+ .base.cra_name = "gcm(aes)",
+ .base.cra_driver_name = "gcm-aes-ce",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct gcm_aes_ctx),
+ .base.cra_module = THIS_MODULE,
};

static int __init ghash_ce_mod_init(void)
{
- return crypto_register_shash(&ghash_alg);
+ int ret;
+
+ ret = crypto_register_aead(&gcm_aes_alg);
+ if (ret)
+ return ret;
+
+ ret = crypto_register_shash(&ghash_alg);
+ if (ret)
+ crypto_unregister_aead(&gcm_aes_alg);
+ return ret;
}

static void __exit ghash_ce_mod_exit(void)
{
crypto_unregister_shash(&ghash_alg);
+ crypto_unregister_aead(&gcm_aes_alg);
}

module_cpu_feature_match(PMULL, ghash_ce_mod_init);
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:19 UTC
Permalink
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.

This should significantly reduce the Dcache footprint of our code,
making it more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
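
To illustrate why byte loads suffice (sketch only, not part of the
patch): the final AES round omits MixColumns, so each output byte is
simply an S-box substitution of one input byte XORed with the round
key. With sbox[] standing in for the 256 byte table the assembler
indexes, one output column can be computed as

static u32 aes_final_round_word(const u8 sbox[256], u32 c0, u32 c1,
				u32 c2, u32 c3, u32 rk)
{
	/* byte selection across columns mirrors ShiftRows */
	return rk ^ ((u32)sbox[ c0        & 0xff]      ) ^
		    ((u32)sbox[(c1 >>  8) & 0xff] <<  8) ^
		    ((u32)sbox[(c2 >> 16) & 0xff] << 16) ^
		    ((u32)sbox[(c3 >> 24) & 0xff] << 24);
}

so no expanded 32-bit "final round" tables are needed at all.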

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm/crypto/aes-cipher-core.S | 88 +++++++++++++++-----
1 file changed, 65 insertions(+), 23 deletions(-)

diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
index c817a86c4ca8..54b384084637 100644
--- a/arch/arm/crypto/aes-cipher-core.S
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -10,6 +10,7 @@
*/

#include <linux/linkage.h>
+#include <asm/cache.h>

.text
.align 5
@@ -32,19 +33,19 @@
.endif
.endm

- .macro __load, out, in, idx
+ .macro __load, out, in, idx, sz, op
.if __LINUX_ARM_ARCH__ < 7 && \idx > 0
- ldr \out, [ttab, \in, lsr #(8 * \idx) - 2]
+ ldr\op \out, [ttab, \in, lsr #(8 * \idx) - \sz]
.else
- ldr \out, [ttab, \in, lsl #2]
+ ldr\op \out, [ttab, \in, lsl #\sz]
.endif
.endm

- .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc
+ .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op
__select \out0, \in0, 0
__select t0, \in1, 1
- __load \out0, \out0, 0
- __load t0, t0, 1
+ __load \out0, \out0, 0, \sz, \op
+ __load t0, t0, 1, \sz, \op

.if \enc
__select \out1, \in1, 0
@@ -53,10 +54,10 @@
__select \out1, \in3, 0
__select t1, \in0, 1
.endif
- __load \out1, \out1, 0
+ __load \out1, \out1, 0, \sz, \op
__select t2, \in2, 2
- __load t1, t1, 1
- __load t2, t2, 2
+ __load t1, t1, 1, \sz, \op
+ __load t2, t2, 2, \sz, \op

eor \out0, \out0, t0, ror #24

@@ -68,9 +69,9 @@
__select \t3, \in1, 2
__select \t4, \in2, 3
.endif
- __load \t3, \t3, 2
- __load t0, t0, 3
- __load \t4, \t4, 3
+ __load \t3, \t3, 2, \sz, \op
+ __load t0, t0, 3, \sz, \op
+ __load \t4, \t4, 3, \sz, \op

eor \out1, \out1, t1, ror #24
eor \out0, \out0, t2, ror #16
@@ -82,14 +83,14 @@
eor \out1, \out1, t2
.endm

- .macro fround, out0, out1, out2, out3, in0, in1, in2, in3
- __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
- __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
+ .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+ __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
+ __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
.endm

- .macro iround, out0, out1, out2, out3, in0, in1, in2, in3
- __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
- __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
+ .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+ __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
+ __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
.endm

.macro __rev, out, in
@@ -114,7 +115,7 @@
.endif
.endm

- .macro do_crypt, round, ttab, ltab
+ .macro do_crypt, round, ttab, ltab, bsz
push {r3-r11, lr}

ldr r4, [in]
@@ -146,9 +147,12 @@

1: subs rounds, rounds, #4
\round r8, r9, r10, r11, r4, r5, r6, r7
- __adrl ttab, \ltab, ls
+ bls 2f
\round r4, r5, r6, r7, r8, r9, r10, r11
- bhi 0b
+ b 0b
+
+2: __adrl ttab, \ltab
+ \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b

#ifdef CONFIG_CPU_BIG_ENDIAN
__rev r4, r4
@@ -170,10 +174,48 @@
.ltorg
.endm

+ .align L1_CACHE_SHIFT
+ .type __aes_arm_inverse_sbox, %object
+__aes_arm_inverse_sbox:
+ .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+ .size __aes_arm_inverse_sbox, . - __aes_arm_inverse_sbox
+
ENTRY(__aes_arm_encrypt)
- do_crypt fround, crypto_ft_tab, crypto_fl_tab
+ do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2
ENDPROC(__aes_arm_encrypt)

+ .align 5
ENTRY(__aes_arm_decrypt)
- do_crypt iround, crypto_it_tab, crypto_il_tab
+ do_crypt iround, crypto_it_tab, __aes_arm_inverse_sbox, 0
ENDPROC(__aes_arm_decrypt)
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:17 UTC
Permalink
Implement a NEON fallback for systems that do support NEON but have
no support for the optional 64x64->128 polynomial multiplication
instruction that is part of the ARMv8 Crypto Extensions. It is based
on the paper "Fast Software Polynomial Multiplication on ARM Processors
Using the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
Ricardo Dahab (https://hal.inria.fr/hal-01506572).

On a 32-bit guest executing under KVM on a Cortex-A57, the new code is
not only 4x faster than the generic table based GHASH driver, it is also
time invariant. (Note that the existing vmull.p64 code is 16x faster on
this core).
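
For reference, the 64x64 -> 128 bit carryless multiplication that a
single vmull.p64 instruction performs can be modelled in C as below
(illustration only; the patch instead synthesizes the product from
8x8 bit vmull.p8 partial products, as described in the paper):

static void clmul64(u64 a, u64 b, u64 res[2])
{
	int i;

	res[0] = res[1] = 0;
	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			/* XOR rather than add: GF(2) polynomial math */
			res[0] ^= a << i;
			if (i > 0)
				res[1] ^= a >> (64 - i);
		}
	}
}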

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm/crypto/Kconfig | 5 +-
arch/arm/crypto/ghash-ce-core.S | 234 ++++++++++++++++----
arch/arm/crypto/ghash-ce-glue.c | 24 +-
3 files changed, 215 insertions(+), 48 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index b9adedcc5b2e..ec72752d5668 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -94,14 +94,15 @@ config CRYPTO_AES_ARM_CE
ARMv8 Crypto Extensions

config CRYPTO_GHASH_ARM_CE
- tristate "PMULL-accelerated GHASH using ARMv8 Crypto Extensions"
+ tristate "PMULL-accelerated GHASH using NEON/ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_CRYPTD
help
Use an implementation of GHASH (used by the GCM AEAD chaining mode)
that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
- that is part of the ARMv8 Crypto Extensions
+ that is part of the ARMv8 Crypto Extensions, or a slower variant that
+ uses the vmull.p8 instruction that is part of the basic NEON ISA.

config CRYPTO_CRCT10DIF_ARM_CE
tristate "CRCT10DIF digest algorithm using PMULL instructions"
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index f6ab8bcc9efe..2f78c10b1881 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -1,7 +1,7 @@
/*
- * Accelerated GHASH implementation with ARMv8 vmull.p64 instructions.
+ * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
*
- * Copyright (C) 2015 Linaro Ltd. <***@linaro.org>
+ * Copyright (C) 2015 - 2017 Linaro Ltd. <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@@ -12,40 +12,162 @@
#include <asm/assembler.h>

SHASH .req q0
- SHASH2 .req q1
- T1 .req q2
- T2 .req q3
- MASK .req q4
- XL .req q5
- XM .req q6
- XH .req q7
- IN1 .req q7
+ T1 .req q1
+ XL .req q2
+ XM .req q3
+ XH .req q4
+ IN1 .req q4

SHASH_L .req d0
SHASH_H .req d1
- SHASH2_L .req d2
- T1_L .req d4
- MASK_L .req d8
- XL_L .req d10
- XL_H .req d11
- XM_L .req d12
- XM_H .req d13
- XH_L .req d14
+ T1_L .req d2
+ T1_H .req d3
+ XL_L .req d4
+ XL_H .req d5
+ XM_L .req d6
+ XM_H .req d7
+ XH_L .req d8
+
+ t0l .req d10
+ t0h .req d11
+ t1l .req d12
+ t1h .req d13
+ t2l .req d14
+ t2h .req d15
+ t3l .req d16
+ t3h .req d17
+ t4l .req d18
+ t4h .req d19
+
+ t0q .req q5
+ t1q .req q6
+ t2q .req q7
+ t3q .req q8
+ t4q .req q9
+ T2 .req q9
+
+ s1l .req d20
+ s1h .req d21
+ s2l .req d22
+ s2h .req d23
+ s3l .req d24
+ s3h .req d25
+ s4l .req d26
+ s4h .req d27
+
+ MASK .req d28
+ SHASH2_p8 .req d28
+
+ k16 .req d29
+ k32 .req d30
+ k48 .req d31
+ SHASH2_p64 .req d31

.text
.fpu crypto-neon-fp-armv8

+ .macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4
+ vmull.p64 \rd, \rn, \rm
+ .endm
+
/*
- * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- * struct ghash_key const *k, const char *head)
+ * This implementation of 64x64 -> 128 bit polynomial multiplication
+ * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
+ * "Fast Software Polynomial Multiplication on ARM Processors Using
+ * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
+ * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
+ *
+ * It has been slightly tweaked for in-order performance, and to allow
+ * 'rq' to overlap with 'ad' or 'bd'.
*/
-ENTRY(pmull_ghash_update)
- vld1.64 {SHASH}, [r3]
+ .macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
+ vext.8 t0l, \ad, \ad, #1 @ A1
+ .ifc \b1, t4l
+ vext.8 t4l, \bd, \bd, #1 @ B1
+ .endif
+ vmull.p8 t0q, t0l, \bd @ F = A1*B
+ vext.8 t1l, \ad, \ad, #2 @ A2
+ vmull.p8 t4q, \ad, \b1 @ E = A*B1
+ .ifc \b2, t3l
+ vext.8 t3l, \bd, \bd, #2 @ B2
+ .endif
+ vmull.p8 t1q, t1l, \bd @ H = A2*B
+ vext.8 t2l, \ad, \ad, #3 @ A3
+ vmull.p8 t3q, \ad, \b2 @ G = A*B2
+ veor t0q, t0q, t4q @ L = E + F
+ .ifc \b3, t4l
+ vext.8 t4l, \bd, \bd, #3 @ B3
+ .endif
+ vmull.p8 t2q, t2l, \bd @ J = A3*B
+ veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8
+ veor t1q, t1q, t3q @ M = G + H
+ .ifc \b4, t3l
+ vext.8 t3l, \bd, \bd, #4 @ B4
+ .endif
+ vmull.p8 t4q, \ad, \b3 @ I = A*B3
+ veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16
+ vmull.p8 t3q, \ad, \b4 @ K = A*B4
+ vand t0h, t0h, k48
+ vand t1h, t1h, k32
+ veor t2q, t2q, t4q @ N = I + J
+ veor t0l, t0l, t0h
+ veor t1l, t1l, t1h
+ veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24
+ vand t2h, t2h, k16
+ veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 t3h, #0
+ vext.8 t0q, t0q, t0q, #15
+ veor t2l, t2l, t2h
+ vext.8 t1q, t1q, t1q, #14
+ vmull.p8 \rq, \ad, \bd @ D = A*B
+ vext.8 t2q, t2q, t2q, #13
+ vext.8 t3q, t3q, t3q, #12
+ veor t0q, t0q, t1q
+ veor t2q, t2q, t3q
+ veor \rq, \rq, t0q
+ veor \rq, \rq, t2q
+ .endm
+
+ //
+ // PMULL (64x64->128) based reduction for CPUs that can do
+ // it in a single instruction.
+ //
+ .macro __pmull_reduce_p64
+ vmull.p64 T1, XL_L, MASK
+
+ veor XH_L, XH_L, XM_H
+ vext.8 T1, T1, T1, #8
+ veor XL_H, XL_H, XM_L
+ veor T1, T1, XL
+
+ vmull.p64 XL, T1_H, MASK
+ .endm
+
+ //
+ // Alternative reduction for CPUs that lack support for the
+ // 64x64->128 PMULL instruction
+ //
+ .macro __pmull_reduce_p8
+ veor XL_H, XL_H, XM_L
+ veor XH_L, XH_L, XM_H
+
+ vshl.i64 T1, XL, #57
+ vshl.i64 T2, XL, #62
+ veor T1, T1, T2
+ vshl.i64 T2, XL, #63
+ veor T1, T1, T2
+ veor XL_H, XL_H, T1_L
+ veor XH_L, XH_L, T1_H
+
+ vshr.u64 T1, XL, #1
+ veor XH, XH, XL
+ veor XL, XL, T1
+ vshr.u64 T1, T1, #6
+ vshr.u64 XL, XL, #1
+ .endm
+
+ .macro ghash_update, pn
vld1.64 {XL}, [r1]
- vmov.i8 MASK, #0xe1
- vext.8 SHASH2, SHASH, SHASH, #8
- vshl.u64 MASK, MASK, #57
- veor SHASH2, SHASH2, SHASH

/* do the head block first, if supplied */
ldr ip, [sp]
@@ -62,33 +184,59 @@ ENTRY(pmull_ghash_update)
#ifndef CONFIG_CPU_BIG_ENDIAN
vrev64.8 T1, T1
#endif
- vext.8 T2, XL, XL, #8
vext.8 IN1, T1, T1, #8
- veor T1, T1, T2
+ veor T1_L, T1_L, XL_H
veor XL, XL, IN1

- vmull.p64 XH, SHASH_H, XL_H @ a1 * b1
+ __pmull_\pn XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1
veor T1, T1, XL
- vmull.p64 XL, SHASH_L, XL_L @ a0 * b0
- vmull.p64 XM, SHASH2_L, T1_L @ (a1 + a0)(b1 + b0)
+ __pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
+ __pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0)

- vext.8 T1, XL, XH, #8
- veor T2, XL, XH
+ veor T1, XL, XH
veor XM, XM, T1
- veor XM, XM, T2
- vmull.p64 T2, XL_L, MASK_L

- vmov XH_L, XM_H
- vmov XM_H, XL_L
+ __pmull_reduce_\pn

- veor XL, XM, T2
- vext.8 T2, XL, XL, #8
- vmull.p64 XL, XL_L, MASK_L
- veor T2, T2, XH
- veor XL, XL, T2
+ veor T1, T1, XH
+ veor XL, XL, T1

bne 0b

vst1.64 {XL}, [r1]
bx lr
-ENDPROC(pmull_ghash_update)
+ .endm
+
+ /*
+ * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
+ * struct ghash_key const *k, const char *head)
+ */
+ENTRY(pmull_ghash_update_p64)
+ vld1.64 {SHASH}, [r3]
+ veor SHASH2_p64, SHASH_L, SHASH_H
+
+ vmov.i8 MASK, #0xe1
+ vshl.u64 MASK, MASK, #57
+
+ ghash_update p64
+ENDPROC(pmull_ghash_update_p64)
+
+ENTRY(pmull_ghash_update_p8)
+ vld1.64 {SHASH}, [r3]
+ veor SHASH2_p8, SHASH_L, SHASH_H
+
+ vext.8 s1l, SHASH_L, SHASH_L, #1
+ vext.8 s2l, SHASH_L, SHASH_L, #2
+ vext.8 s3l, SHASH_L, SHASH_L, #3
+ vext.8 s4l, SHASH_L, SHASH_L, #4
+ vext.8 s1h, SHASH_H, SHASH_H, #1
+ vext.8 s2h, SHASH_H, SHASH_H, #2
+ vext.8 s3h, SHASH_H, SHASH_H, #3
+ vext.8 s4h, SHASH_H, SHASH_H, #4
+
+ vmov.i64 k16, #0xffff
+ vmov.i64 k32, #0xffffffff
+ vmov.i64 k48, #0xffffffffffff
+
+ ghash_update p8
+ENDPROC(pmull_ghash_update_p8)
diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c
index 6bac8bea9f1e..d9bb52cae2ac 100644
--- a/arch/arm/crypto/ghash-ce-glue.c
+++ b/arch/arm/crypto/ghash-ce-glue.c
@@ -22,6 +22,7 @@
MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <***@linaro.org>");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("ghash");

#define GHASH_BLOCK_SIZE 16
#define GHASH_DIGEST_SIZE 16
@@ -41,8 +42,17 @@ struct ghash_async_ctx {
struct cryptd_ahash *cryptd_tfm;
};

-asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k, const char *head);
+asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+ struct ghash_key const *k,
+ const char *head);
+
+asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+ struct ghash_key const *k,
+ const char *head);
+
+static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
+ struct ghash_key const *k,
+ const char *head);

static int ghash_init(struct shash_desc *desc)
{
@@ -312,6 +322,14 @@ static int __init ghash_ce_mod_init(void)
{
int err;

+ if (!(elf_hwcap & HWCAP_NEON))
+ return -ENODEV;
+
+ if (elf_hwcap2 & HWCAP2_PMULL)
+ pmull_ghash_update = pmull_ghash_update_p64;
+ else
+ pmull_ghash_update = pmull_ghash_update_p8;
+
err = crypto_register_shash(&ghash_alg);
if (err)
return err;
@@ -332,5 +350,5 @@ static void __exit ghash_ce_mod_exit(void)
crypto_unregister_shash(&ghash_alg);
}

-module_cpu_feature_match(PMULL, ghash_ce_mod_init);
+module_init(ghash_ce_mod_init);
module_exit(ghash_ce_mod_exit);
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:20 UTC
Permalink
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.

This should significantly reduce the Dcache footprint of our code,
making it more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms; avoiding it improves the shareability of
this code.

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/aes-cipher-core.S | 152 ++++++++++++++------
1 file changed, 107 insertions(+), 45 deletions(-)

diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S
index f2f9cc519309..6d2445d603cc 100644
--- a/arch/arm64/crypto/aes-cipher-core.S
+++ b/arch/arm64/crypto/aes-cipher-core.S
@@ -10,6 +10,7 @@

#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/cache.h>

.text

@@ -17,94 +18,155 @@
out .req x1
in .req x2
rounds .req x3
- tt .req x4
- lt .req x2
+ tt .req x2

- .macro __pair, enc, reg0, reg1, in0, in1e, in1d, shift
+ .macro __pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift
+ .ifc \op\shift, b0
+ ubfiz \reg0, \in0, #2, #8
+ ubfiz \reg1, \in1e, #2, #8
+ .else
ubfx \reg0, \in0, #\shift, #8
- .if \enc
ubfx \reg1, \in1e, #\shift, #8
- .else
- ubfx \reg1, \in1d, #\shift, #8
.endif
+
+ /*
+ * AArch64 cannot do byte size indexed loads from a table containing
+ * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a
+ * valid instruction. So perform the shift explicitly first for the
+ * high bytes (the low byte is shifted implicitly by using ubfiz rather
+ * than ubfx above)
+ */
+ .ifnc \op, b
ldr \reg0, [tt, \reg0, uxtw #2]
ldr \reg1, [tt, \reg1, uxtw #2]
+ .else
+ .if \shift > 0
+ lsl \reg0, \reg0, #2
+ lsl \reg1, \reg1, #2
+ .endif
+ ldrb \reg0, [tt, \reg0, uxtw]
+ ldrb \reg1, [tt, \reg1, uxtw]
+ .endif
.endm

- .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc
+ .macro __pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift
+ ubfx \reg0, \in0, #\shift, #8
+ ubfx \reg1, \in1d, #\shift, #8
+ ldr\op \reg0, [tt, \reg0, uxtw #\sz]
+ ldr\op \reg1, [tt, \reg1, uxtw #\sz]
+ .endm
+
+ .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op
ldp \out0, \out1, [rk], #8

- __pair \enc, w13, w14, \in0, \in1, \in3, 0
- __pair \enc, w15, w16, \in1, \in2, \in0, 8
- __pair \enc, w17, w18, \in2, \in3, \in1, 16
- __pair \enc, \t0, \t1, \in3, \in0, \in2, 24
-
- eor \out0, \out0, w13
- eor \out1, \out1, w14
- eor \out0, \out0, w15, ror #24
- eor \out1, \out1, w16, ror #24
- eor \out0, \out0, w17, ror #16
- eor \out1, \out1, w18, ror #16
+ __pair\enc \sz, \op, w12, w13, \in0, \in1, \in3, 0
+ __pair\enc \sz, \op, w14, w15, \in1, \in2, \in0, 8
+ __pair\enc \sz, \op, w16, w17, \in2, \in3, \in1, 16
+ __pair\enc \sz, \op, \t0, \t1, \in3, \in0, \in2, 24
+
+ eor \out0, \out0, w12
+ eor \out1, \out1, w13
+ eor \out0, \out0, w14, ror #24
+ eor \out1, \out1, w15, ror #24
+ eor \out0, \out0, w16, ror #16
+ eor \out1, \out1, w17, ror #16
eor \out0, \out0, \t0, ror #8
eor \out1, \out1, \t1, ror #8
.endm

- .macro fround, out0, out1, out2, out3, in0, in1, in2, in3
- __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
- __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
+ .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+ __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
+ __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
.endm

- .macro iround, out0, out1, out2, out3, in0, in1, in2, in3
- __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
- __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
+ .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+ __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
+ __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
.endm

- .macro do_crypt, round, ttab, ltab
- ldp w5, w6, [in]
- ldp w7, w8, [in, #8]
- ldp w9, w10, [rk], #16
- ldp w11, w12, [rk, #-8]
+ .macro do_crypt, round, ttab, ltab, bsz
+ ldp w4, w5, [in]
+ ldp w6, w7, [in, #8]
+ ldp w8, w9, [rk], #16
+ ldp w10, w11, [rk, #-8]

+CPU_BE( rev w4, w4 )
CPU_BE( rev w5, w5 )
CPU_BE( rev w6, w6 )
CPU_BE( rev w7, w7 )
-CPU_BE( rev w8, w8 )

+ eor w4, w4, w8
eor w5, w5, w9
eor w6, w6, w10
eor w7, w7, w11
- eor w8, w8, w12

adr_l tt, \ttab
- adr_l lt, \ltab

tbnz rounds, #1, 1f

-0: \round w9, w10, w11, w12, w5, w6, w7, w8
- \round w5, w6, w7, w8, w9, w10, w11, w12
+0: \round w8, w9, w10, w11, w4, w5, w6, w7
+ \round w4, w5, w6, w7, w8, w9, w10, w11

1: subs rounds, rounds, #4
- \round w9, w10, w11, w12, w5, w6, w7, w8
- csel tt, tt, lt, hi
- \round w5, w6, w7, w8, w9, w10, w11, w12
- b.hi 0b
-
+ \round w8, w9, w10, w11, w4, w5, w6, w7
+ b.ls 3f
+2: \round w4, w5, w6, w7, w8, w9, w10, w11
+ b 0b
+3: adr_l tt, \ltab
+ \round w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b
+
+CPU_BE( rev w4, w4 )
CPU_BE( rev w5, w5 )
CPU_BE( rev w6, w6 )
CPU_BE( rev w7, w7 )
-CPU_BE( rev w8, w8 )

- stp w5, w6, [out]
- stp w7, w8, [out, #8]
+ stp w4, w5, [out]
+ stp w6, w7, [out, #8]
ret
.endm

- .align 5
+ .align L1_CACHE_SHIFT
+ .type __aes_arm64_inverse_sbox, %object
+__aes_arm64_inverse_sbox:
+ .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+ .size __aes_arm64_inverse_sbox, . - __aes_arm64_inverse_sbox
+
ENTRY(__aes_arm64_encrypt)
- do_crypt fround, crypto_ft_tab, crypto_fl_tab
+ do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2
ENDPROC(__aes_arm64_encrypt)

.align 5
ENTRY(__aes_arm64_decrypt)
- do_crypt iround, crypto_it_tab, crypto_il_tab
+ do_crypt iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0
ENDPROC(__aes_arm64_decrypt)
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:18 UTC
Permalink
Implement a NEON fallback for systems that do support NEON but have
no support for the optional 64x64->128 polynomial multiplication
instruction that is part of the ARMv8 Crypto Extensions. It is based
on the paper "Fast Software Polynomial Multiplication on ARM Processors
Using the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
Ricardo Dahab (https://hal.inria.fr/hal-01506572), but has been reworked
extensively for the AArch64 ISA.

On a low-end core such as the Cortex-A53 found in the Raspberry Pi3, the
NEON based implementation is 4x faster than the table based one, and
is time invariant as well, making it less vulnerable to timing attacks.
When combined with the bit-sliced NEON implementation of AES-CTR, the
AES-GCM performance doubles (from 58 to 29 cycles per byte).

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/ghash-ce-core.S | 248 +++++++++++++++++---
arch/arm64/crypto/ghash-ce-glue.c | 40 +++-
2 files changed, 252 insertions(+), 36 deletions(-)

diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index cb22459eba85..11ebf1ae248a 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -1,7 +1,7 @@
/*
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
*
- * Copyright (C) 2014 Linaro Ltd. <***@linaro.org>
+ * Copyright (C) 2014 - 2017 Linaro Ltd. <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@@ -11,31 +11,215 @@
#include <linux/linkage.h>
#include <asm/assembler.h>

- SHASH .req v0
- SHASH2 .req v1
- T1 .req v2
- T2 .req v3
- MASK .req v4
- XL .req v5
- XM .req v6
- XH .req v7
- IN1 .req v7
+ SHASH .req v0
+ SHASH2 .req v1
+ T1 .req v2
+ T2 .req v3
+ MASK .req v4
+ XL .req v5
+ XM .req v6
+ XH .req v7
+ IN1 .req v7
+
+ k00_16 .req v8
+ k32_48 .req v9
+
+ t3 .req v10
+ t4 .req v11
+ t5 .req v12
+ t6 .req v13
+ t7 .req v14
+ t8 .req v15
+ t9 .req v16
+
+ perm1 .req v17
+ perm2 .req v18
+ perm3 .req v19
+
+ sh1 .req v20
+ sh2 .req v21
+ sh3 .req v22
+ sh4 .req v23
+
+ ss1 .req v24
+ ss2 .req v25
+ ss3 .req v26
+ ss4 .req v27

.text
.arch armv8-a+crypto

- /*
- * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- * struct ghash_key const *k, const char *head)
- */
-ENTRY(pmull_ghash_update)
+ .macro __pmull_p64, rd, rn, rm
+ pmull \rd\().1q, \rn\().1d, \rm\().1d
+ .endm
+
+ .macro __pmull2_p64, rd, rn, rm
+ pmull2 \rd\().1q, \rn\().2d, \rm\().2d
+ .endm
+
+ .macro __pmull_p8, rq, ad, bd
+ ext t3.8b, \ad\().8b, \ad\().8b, #1 // A1
+ ext t5.8b, \ad\().8b, \ad\().8b, #2 // A2
+ ext t7.8b, \ad\().8b, \ad\().8b, #3 // A3
+
+ __pmull_p8_\bd \rq, \ad
+ .endm
+
+ .macro __pmull2_p8, rq, ad, bd
+ tbl t3.16b, {\ad\().16b}, perm1.16b // A1
+ tbl t5.16b, {\ad\().16b}, perm2.16b // A2
+ tbl t7.16b, {\ad\().16b}, perm3.16b // A3
+
+ __pmull2_p8_\bd \rq, \ad
+ .endm
+
+ .macro __pmull_p8_SHASH, rq, ad
+ __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
+ .endm
+
+ .macro __pmull_p8_SHASH2, rq, ad
+ __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
+ .endm
+
+ .macro __pmull2_p8_SHASH, rq, ad
+ __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
+ .endm
+
+ .macro __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
+ pmull\t t3.8h, t3.\nb, \bd // F = A1*B
+ pmull\t t4.8h, \ad, \b1\().\nb // E = A*B1
+ pmull\t t5.8h, t5.\nb, \bd // H = A2*B
+ pmull\t t6.8h, \ad, \b2\().\nb // G = A*B2
+ pmull\t t7.8h, t7.\nb, \bd // J = A3*B
+ pmull\t t8.8h, \ad, \b3\().\nb // I = A*B3
+ pmull\t t9.8h, \ad, \b4\().\nb // K = A*B4
+ pmull\t \rq\().8h, \ad, \bd // D = A*B
+
+ eor t3.16b, t3.16b, t4.16b // L = E + F
+ eor t5.16b, t5.16b, t6.16b // M = G + H
+ eor t7.16b, t7.16b, t8.16b // N = I + J
+
+ uzp1 t4.2d, t3.2d, t5.2d
+ uzp2 t3.2d, t3.2d, t5.2d
+ uzp1 t6.2d, t7.2d, t9.2d
+ uzp2 t7.2d, t7.2d, t9.2d
+
+ // t3 = (L) (P0 + P1) << 8
+ // t5 = (M) (P2 + P3) << 16
+ eor t4.16b, t4.16b, t3.16b
+ and t3.16b, t3.16b, k32_48.16b
+
+ // t7 = (N) (P4 + P5) << 24
+ // t9 = (K) (P6 + P7) << 32
+ eor t6.16b, t6.16b, t7.16b
+ and t7.16b, t7.16b, k00_16.16b
+
+ eor t4.16b, t4.16b, t3.16b
+ eor t6.16b, t6.16b, t7.16b
+
+ zip2 t5.2d, t4.2d, t3.2d
+ zip1 t3.2d, t4.2d, t3.2d
+ zip2 t9.2d, t6.2d, t7.2d
+ zip1 t7.2d, t6.2d, t7.2d
+
+ ext t3.16b, t3.16b, t3.16b, #15
+ ext t5.16b, t5.16b, t5.16b, #14
+ ext t7.16b, t7.16b, t7.16b, #13
+ ext t9.16b, t9.16b, t9.16b, #12
+
+ eor t3.16b, t3.16b, t5.16b
+ eor t7.16b, t7.16b, t9.16b
+ eor \rq\().16b, \rq\().16b, t3.16b
+ eor \rq\().16b, \rq\().16b, t7.16b
+ .endm
+
+ .macro __pmull_pre_p64
+ movi MASK.16b, #0xe1
+ shl MASK.2d, MASK.2d, #57
+ .endm
+
+ .macro __pmull_pre_p8
+ // k00_16 := 0x0000000000000000_000000000000ffff
+ // k32_48 := 0x00000000ffffffff_0000ffffffffffff
+ movi k32_48.2d, #0xffffffff
+ mov k32_48.h[2], k32_48.h[0]
+ ushr k00_16.2d, k32_48.2d, #32
+
+ // prepare the permutation vectors
+ mov_q x5, 0x080f0e0d0c0b0a09
+ movi T1.8b, #8
+ dup perm1.2d, x5
+ eor perm1.16b, perm1.16b, T1.16b
+ ushr perm2.2d, perm1.2d, #8
+ ushr perm3.2d, perm1.2d, #16
+ ushr T1.2d, perm1.2d, #24
+ sli perm2.2d, perm1.2d, #56
+ sli perm3.2d, perm1.2d, #48
+ sli T1.2d, perm1.2d, #40
+
+ // precompute loop invariants
+ tbl sh1.16b, {SHASH.16b}, perm1.16b
+ tbl sh2.16b, {SHASH.16b}, perm2.16b
+ tbl sh3.16b, {SHASH.16b}, perm3.16b
+ tbl sh4.16b, {SHASH.16b}, T1.16b
+ ext ss1.8b, SHASH2.8b, SHASH2.8b, #1
+ ext ss2.8b, SHASH2.8b, SHASH2.8b, #2
+ ext ss3.8b, SHASH2.8b, SHASH2.8b, #3
+ ext ss4.8b, SHASH2.8b, SHASH2.8b, #4
+ .endm
+
+ //
+ // PMULL (64x64->128) based reduction for CPUs that can do
+ // it in a single instruction.
+ //
+ .macro __pmull_reduce_p64
+ pmull T2.1q, XL.1d, MASK.1d
+ eor XM.16b, XM.16b, T1.16b
+
+ mov XH.d[0], XM.d[1]
+ mov XM.d[1], XL.d[0]
+
+ eor XL.16b, XM.16b, T2.16b
+ ext T2.16b, XL.16b, XL.16b, #8
+ pmull XL.1q, XL.1d, MASK.1d
+ .endm
+
+ //
+ // Alternative reduction for CPUs that lack support for the
+ // 64x64->128 PMULL instruction
+ //
+ .macro __pmull_reduce_p8
+ eor XM.16b, XM.16b, T1.16b
+
+ mov XL.d[1], XM.d[0]
+ mov XH.d[0], XM.d[1]
+
+ shl T1.2d, XL.2d, #57
+ shl T2.2d, XL.2d, #62
+ eor T2.16b, T2.16b, T1.16b
+ shl T1.2d, XL.2d, #63
+ eor T2.16b, T2.16b, T1.16b
+ ext T1.16b, XL.16b, XH.16b, #8
+ eor T2.16b, T2.16b, T1.16b
+
+ mov XL.d[1], T2.d[0]
+ mov XH.d[0], T2.d[1]
+
+ ushr T2.2d, XL.2d, #1
+ eor XH.16b, XH.16b, XL.16b
+ eor XL.16b, XL.16b, T2.16b
+ ushr T2.2d, T2.2d, #6
+ ushr XL.2d, XL.2d, #1
+ .endm
+
+ .macro __pmull_ghash, pn
ld1 {SHASH.2d}, [x3]
ld1 {XL.2d}, [x1]
- movi MASK.16b, #0xe1
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
- shl MASK.2d, MASK.2d, #57
eor SHASH2.16b, SHASH2.16b, SHASH.16b

+ __pmull_pre_\pn
+
/* do the head block first, if supplied */
cbz x4, 0f
ld1 {T1.2d}, [x4]
@@ -52,23 +236,17 @@ CPU_LE( rev64 T1.16b, T1.16b )
eor T1.16b, T1.16b, T2.16b
eor XL.16b, XL.16b, IN1.16b

- pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
+ __pmull2_\pn XH, XL, SHASH // a1 * b1
eor T1.16b, T1.16b, XL.16b
- pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
- pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
+ __pmull_\pn XL, XL, SHASH // a0 * b0
+ __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0)

- ext T1.16b, XL.16b, XH.16b, #8
eor T2.16b, XL.16b, XH.16b
- eor XM.16b, XM.16b, T1.16b
+ ext T1.16b, XL.16b, XH.16b, #8
eor XM.16b, XM.16b, T2.16b
- pmull T2.1q, XL.1d, MASK.1d

- mov XH.d[0], XM.d[1]
- mov XM.d[1], XL.d[0]
+ __pmull_reduce_\pn

- eor XL.16b, XM.16b, T2.16b
- ext T2.16b, XL.16b, XL.16b, #8
- pmull XL.1q, XL.1d, MASK.1d
eor T2.16b, T2.16b, XH.16b
eor XL.16b, XL.16b, T2.16b

@@ -76,7 +254,19 @@ CPU_LE( rev64 T1.16b, T1.16b )

st1 {XL.2d}, [x1]
ret
-ENDPROC(pmull_ghash_update)
+ .endm
+
+ /*
+ * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
+ * struct ghash_key const *k, const char *head)
+ */
+ENTRY(pmull_ghash_update_p64)
+ __pmull_ghash p64
+ENDPROC(pmull_ghash_update_p64)
+
+ENTRY(pmull_ghash_update_p8)
+ __pmull_ghash p8
+ENDPROC(pmull_ghash_update_p8)

KS .req v8
CTR .req v9
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index ee6aaac05905..cfc9c92814fd 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -26,6 +26,7 @@
MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <***@linaro.org>");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("ghash");

#define GHASH_BLOCK_SIZE 16
#define GHASH_DIGEST_SIZE 16
@@ -48,8 +49,17 @@ struct gcm_aes_ctx {
struct ghash_key ghash_key;
};

-asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k, const char *head);
+asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+ struct ghash_key const *k,
+ const char *head);
+
+asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+ struct ghash_key const *k,
+ const char *head);
+
+static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
+ struct ghash_key const *k,
+ const char *head);

asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
const u8 src[], struct ghash_key const *k,
@@ -557,13 +567,24 @@ static int __init ghash_ce_mod_init(void)
{
int ret;

- ret = crypto_register_aead(&gcm_aes_alg);
- if (ret)
- return ret;
+ if (!(elf_hwcap & HWCAP_ASIMD))
+ return -ENODEV;
+
+ if (elf_hwcap & HWCAP_PMULL)
+ pmull_ghash_update = pmull_ghash_update_p64;
+
+ else
+ pmull_ghash_update = pmull_ghash_update_p8;

ret = crypto_register_shash(&ghash_alg);
if (ret)
- crypto_unregister_aead(&gcm_aes_alg);
+ return ret;
+
+ if (elf_hwcap & HWCAP_PMULL) {
+ ret = crypto_register_aead(&gcm_aes_alg);
+ if (ret)
+ crypto_unregister_shash(&ghash_alg);
+ }
return ret;
}

@@ -573,5 +594,10 @@ static void __exit ghash_ce_mod_exit(void)
crypto_unregister_aead(&gcm_aes_alg);
}

-module_cpu_feature_match(PMULL, ghash_ce_mod_init);
+static const struct cpu_feature ghash_cpu_feature[] = {
+ { cpu_feature(PMULL) }, { }
+};
+MODULE_DEVICE_TABLE(cpu, ghash_cpu_feature);
+
+module_init(ghash_ce_mod_init);
module_exit(ghash_ce_mod_exit);
--
2.9.3
Ard Biesheuvel
2017-07-24 10:28:14 UTC
Permalink
To accommodate systems that disallow the use of kernel mode NEON in
some circumstances, take the return value of may_use_simd into
account when deciding whether to invoke the C fallback routine.

Signed-off-by: Ard Biesheuvel <***@linaro.org>
---
arch/arm64/crypto/chacha20-neon-glue.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c
index a7cd575ea223..cbdb75d15cd0 100644
--- a/arch/arm64/crypto/chacha20-neon-glue.c
+++ b/arch/arm64/crypto/chacha20-neon-glue.c
@@ -1,7 +1,7 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
*
- * Copyright (C) 2016 Linaro, Ltd. <***@linaro.org>
+ * Copyright (C) 2016 - 2017 Linaro, Ltd. <***@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -26,6 +26,7 @@

#include <asm/hwcap.h>
#include <asm/neon.h>
+#include <asm/simd.h>

asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
@@ -64,7 +65,7 @@ static int chacha20_neon(struct skcipher_request *req)
u32 state[16];
int err;

- if (req->cryptlen <= CHACHA20_BLOCK_SIZE)
+ if (!may_use_simd() || req->cryptlen <= CHACHA20_BLOCK_SIZE)
return crypto_chacha20_crypt(req);

err = skcipher_walk_virt(&walk, req, true);
--
2.9.3