Henry Case 2023-11-30 09:37:13 +00:00
commit be59b2b31d
3 changed files with 2097 additions and 0 deletions

286  aes-arm64.S  Normal file

@@ -0,0 +1,286 @@
#include "asm-common.h"
.arch armv8-a+crypto
.extern F(abort)
.extern F(rijndael_rcon)
.text
///--------------------------------------------------------------------------
/// Main code.
/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but the internal interface presents as big-endian so as
/// to work better with things like GCM. We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.
// Useful constants.
.equ maxrounds, 16 // maximum number of rounds
.equ maxblksz, 32 // maximum block size, in bytes
.equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
// Context structure.
.equ nr, 0 // number of rounds
.equ w, nr + 4 // encryption key words
.equ wi, w + kbufsz // decryption key words
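// Roughly, in C terms, the layout implied by these offsets is (a
// sketch for orientation only; the authoritative definition lives in
// the C headers):
//
//	struct { uint32_t nr; uint32_t w[kbufsz/4]; uint32_t wi[kbufsz/4]; };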
///--------------------------------------------------------------------------
/// Key setup.
FUNC(rijndael_setup_arm64_crypto)
// Arguments:
// x0 = pointer to context
// w1 = block size in 32-bit words
// x2 = pointer to key material
// x3 = key size in words
pushreg x29, x30
mov x29, sp
// The initial round key material is taken directly from the input
// key, so copy it over. Unfortunately, the key material is not
// guaranteed to be aligned in any especially useful way. Assume
// that alignment traps are not enabled. (Why would they be? On
// A32, alignment traps were part of a transition plan which changed
// the way unaligned loads and stores behaved, but there's never been
// any other behaviour on A64.)
mov x15, x3
add x4, x0, #w
0: sub x15, x15, #1
ldr w14, [x2], #4
str w14, [x4], #4
cbnz x15, 0b
// Find out other useful things and prepare for the main loop.
9: ldr w9, [x0, #nr] // number of rounds
madd w2, w1, w9, w1 // total key size in words
leaext x5, F(rijndael_rcon) // round constants
sub x6, x2, x3 // minus what we've copied already
add x7, x0, #w // position in previous cycle
movi v1.4s, #0 // all-zero register for the key
mov x8, #0 // position in current cycle
// Main key expansion loop. Dispatch according to the position in
// the cycle.
0: ldr w15, [x7], #4 // word from previous cycle
cbz x8, 1f // first word of the cycle?
cmp x8, #4 // fourth word of the cycle?
b.ne 2f
cmp x3, #7 // seven or eight words of key?
b.cc 2f
// Fourth word of the cycle, seven or eight words of key. We must do
// the byte substitution.
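// (How the `aese' trick works: AESE XORs in its round key and then
// applies ShiftRows and SubBytes. The round key here is all-zero, so
// the XOR does nothing; and since `dup' fills every lane of v0 with
// the same word, each row of the state holds four identical bytes and
// ShiftRows changes nothing either. Lane 0 therefore ends up holding
// exactly SubBytes of the original word.)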
dup v0.4s, w14
aese v0.16b, v1.16b // effectively, just SubBytes
mov w14, v0.s[0]
b 2f
// First word of the cycle. Byte substitution, rotation, and round
// constant.
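// (The keys are held little-endian, so `ror #8' moves byte 0 of the
// word to the top -- [a0, a1, a2, a3] becomes [a1, a2, a3, a0] -- which
// is precisely RotWord; and because w13 is a single byte, the `eor'
// folds the round constant into byte 0 of the result, as the key
// schedule requires.)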
1: ldrb w13, [x5], #1 // next round constant
dup v0.4s, w14
aese v0.16b, v1.16b // effectively, just SubBytes
mov w14, v0.s[0]
eor w14, w13, w14, ror #8
// Common ending: mix in the word from the previous cycle and store.
2: eor w14, w14, w15
str w14, [x4], #4
// Prepare for the next iteration. If we're done, then stop; if
// we've finished a cycle then reset the counter.
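// (The `cmp' sets the carry flag once x8 has reached x3, i.e. we've
// produced a full cycle's worth of words; `cmov.cs' -- presumably the
// conditional-move helper from asm-common.h -- then clears x8 ready
// for the next cycle.)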
add x8, x8, #1
sub x6, x6, #1
cmp x8, x3
cbz x6, 9f
cmov.cs x8, xzr
b 0b
// Next job is to construct the decryption keys. The keys for the
// first and last rounds don't need to be mangled, but the remaining
// ones do -- and they all need to be reordered too.
//
// The plan of action, then, is to copy the final encryption round's
// keys into place first, then to do each of the intermediate rounds
// in reverse order, and finally do the first round.
//
// Do all the heavy lifting with the vector registers. The order
// we're doing this in means that it's OK if we read or write too
// much, and there's easily enough buffer space for the
// over-enthusiastic reads and writes because the context has space
// for 32-byte blocks, which is our maximum and an exact fit for two
// full-width registers.
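// (The `aesimc' mangling below is needed because decryption uses the
// `equivalent inverse cipher' structure: AESD/AESIMC apply the inverse
// steps in the same shape as the forward rounds, and that only works
// out if InvMixColumns is applied to the middle rounds' keys ahead of
// time. The first and last keys are only ever XORed in, so they can
// be copied unchanged.)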
9: add x5, x0, #wi
add x4, x0, #w
add x4, x4, w2, uxtw #2
sub x4, x4, w1, uxtw #2 // last round's keys
// Copy the last encryption round's keys.
ld1 {v0.4s, v1.4s}, [x4]
st1 {v0.4s, v1.4s}, [x5]
// Update the loop variables and stop if we've finished.
0: sub w9, w9, #1
add x5, x5, w1, uxtw #2
sub x4, x4, w1, uxtw #2
cbz w9, 9f
// Do another middle round's keys...
ld1 {v0.4s, v1.4s}, [x4]
aesimc v0.16b, v0.16b
aesimc v1.16b, v1.16b
st1 {v0.4s, v1.4s}, [x5]
b 0b
// Finally, copy the first encryption round's keys (no mangling needed).
9: ld1 {v0.4s, v1.4s}, [x4]
st1 {v0.4s, v1.4s}, [x5]
// If the block size is not exactly four words then we must end-swap
// everything. We can use fancy vector toys for this.
cmp w1, #4
b.eq 9f
// End-swap the encryption keys.
add x1, x0, #w
bl endswap_block
// And the decryption keys
add x1, x0, #wi
bl endswap_block
// All done.
9: popreg x29, x30
ret
ENDFUNC
INTFUNC(endswap_block)
// End-swap w2 words starting at x1. x1 is clobbered; w2 is not.
// It's OK to work in 16-byte chunks.
mov w3, w2
0: subs w3, w3, #4
ld1 {v0.4s}, [x1]
rev32 v0.16b, v0.16b
st1 {v0.4s}, [x1], #16
b.hi 0b
ret
ENDFUNC
///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
.macro encdec op, aes, mc, koff
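// Macro arguments: `op' names the entry point (eblk or dblk); `aes'
// is the round instruction (aese or aesd); `mc' is the matching
// mix-columns instruction (aesmc or aesimc); and `koff' is the offset
// of the key schedule to use (w or wi). See the two invocations at
// the end of the file.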
FUNC(rijndael_\op\()_arm64_crypto)
// Arguments:
// x0 = pointer to context
// x1 = pointer to input block
// x2 = pointer to output block
// Set things up ready.
ldr w3, [x0, #nr]
add x0, x0, #\koff
ld1 {v0.4s}, [x1]
rev32 v0.16b, v0.16b
// Check the number of rounds and dispatch.
cmp w3, #14
b.eq 14f
cmp w3, #10
b.eq 10f
cmp w3, #12
b.eq 12f
cmp w3, #13
b.eq 13f
cmp w3, #11
b.eq 11f
callext F(abort)
// Eleven rounds.
11: ld1 {v16.4s}, [x0], #16
\aes v0.16b, v16.16b
\mc v0.16b, v0.16b
b 10f
// Twelve rounds.
12: ld1 {v16.4s, v17.4s}, [x0], #32
\aes v0.16b, v16.16b
\mc v0.16b, v0.16b
\aes v0.16b, v17.16b
\mc v0.16b, v0.16b
b 10f
// Thirteen rounds.
13: ld1 {v16.4s-v18.4s}, [x0], #48
\aes v0.16b, v16.16b
\mc v0.16b, v0.16b
\aes v0.16b, v17.16b
\mc v0.16b, v0.16b
\aes v0.16b, v18.16b
\mc v0.16b, v0.16b
b 10f
// Fourteen rounds. (Drops through to the ten round case because
// this is the next most common.)
14: ld1 {v16.4s-v19.4s}, [x0], #64
\aes v0.16b, v16.16b
\mc v0.16b, v0.16b
\aes v0.16b, v17.16b
\mc v0.16b, v0.16b
\aes v0.16b, v18.16b
\mc v0.16b, v0.16b
\aes v0.16b, v19.16b
\mc v0.16b, v0.16b
// Drop through...
// Ten rounds.
10: ld1 {v16.4s-v19.4s}, [x0], #64
ld1 {v20.4s-v23.4s}, [x0], #64
\aes v0.16b, v16.16b
\mc v0.16b, v0.16b
\aes v0.16b, v17.16b
\mc v0.16b, v0.16b
\aes v0.16b, v18.16b
\mc v0.16b, v0.16b
\aes v0.16b, v19.16b
\mc v0.16b, v0.16b
ld1 {v16.4s-v18.4s}, [x0], #48
\aes v0.16b, v20.16b
\mc v0.16b, v0.16b
\aes v0.16b, v21.16b
\mc v0.16b, v0.16b
\aes v0.16b, v22.16b
\mc v0.16b, v0.16b
\aes v0.16b, v23.16b
\mc v0.16b, v0.16b
// Final round has no MixColumns, but is followed by final whitening.
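// (Recall that AESE/AESD fold their round key in /before/ the
// SubBytes/ShiftRows step, so each key is consumed one instruction
// earlier than the textbook description suggests; the final
// AddRoundKey is therefore just a plain `eor' with the last key.)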
\aes v0.16b, v16.16b
\mc v0.16b, v0.16b
\aes v0.16b, v17.16b
eor v0.16b, v0.16b, v18.16b
// All done.
rev32 v0.16b, v0.16b
st1 {v0.4s}, [x2]
ret
ENDFUNC
.endm
encdec eblk, aese, aesmc, w
encdec dblk, aesd, aesimc, wi
///----- That's all, folks --------------------------------------------------

1180  asm-common.h  Normal file

Diff is too large to display.

631  gcm-arm64-pmull.S  Normal file

@@ -0,0 +1,631 @@
#include "asm-common.h"
.arch armv8-a+crypto
.text
///--------------------------------------------------------------------------
/// Multiplication macros.
// The good news is that we have a fancy instruction to do the
// multiplications. The bad news is that it's not particularly well-
// suited to the job.
//
// For one thing, it only does a 64-bit multiplication, so in general
// we'll need to synthesize the full-width multiply by hand. For
// another thing, it doesn't help with the reduction, so we have to
// do that by hand too. And, finally, GCM has crazy bit ordering,
// and the instruction does nothing useful for that at all.
//
// Focusing on that last problem first: the bits aren't in monotonic
// significance order unless we permute them. Fortunately, ARM64 has
// an instruction which will just permute the bits in each byte for
// us, so we don't have to worry about this very much.
//
// Our main weapons, the `pmull' and `pmull2' instructions, work on
// 64-bit operands, in half of a vector register, and produce 128-bit
// results. But neither of them will multiply the high half of one
// vector by the low half of a second one, so we have a problem,
// which we solve by representing one of the operands redundantly:
// rather than packing the 64-bit pieces together, we duplicate each
// 64-bit piece across both halves of a register.
//
// The commentary for `mul128' is the most detailed. The other
// macros assume that you've already read and understood that.
.macro mul128
// Enter with u and v in v0 and v1/v2 respectively, and 0 in v31;
// leave with z = u v in v0. Clobbers v1--v6.
// First for the double-precision multiplication. It's tempting to
// use Karatsuba's identity here, but I suspect that it loses more in
// the shifting, bit-twiddling, and dependency chains than it gains
// in saving a multiplication which otherwise pipelines well.
// v0 = // (u_0; u_1)
// v1/v2 = // (v_0; v_1)
pmull2 v3.1q, v0.2d, v1.2d // u_1 v_0
pmull v4.1q, v0.1d, v2.1d // u_0 v_1
pmull2 v5.1q, v0.2d, v2.2d // (t_1; x_3) = u_1 v_1
pmull v6.1q, v0.1d, v1.1d // (x_0; t_0) = u_0 v_0
// Arrange the pieces to form a double-precision polynomial.
eor v3.16b, v3.16b, v4.16b // (m_0; m_1) = u_0 v_1 + u_1 v_0
vshr128 v4, v3, 64 // (m_1; 0)
vshl128 v3, v3, 64 // (0; m_0)
eor v1.16b, v5.16b, v4.16b // (x_2; x_3)
eor v0.16b, v6.16b, v3.16b // (x_0; x_1)
// And now the only remaining difficulty is that the result needs to
// be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128
// = t^7 + t^2 + t + 1 in our field. So far, we've calculated z_0
// and z_1 such that z_0 + z_1 R = u v using the identity R = t^128:
// now we must collapse the two halves of y together using the other
// identity R = t^7 + t^2 + t + 1.
//
// We do this by working on y_2 and y_3 separately, so consider y_i
// for i = 2 or 3. Certainly, y_i t^{64i} = y_i R t^{64(i-2)} =
// (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
// directly without breaking up the 64-bit word structure. Instead,
// we start by considering just y_i t^7 t^{64(i-2)}, which again
// looks tricky. Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
// then
//
// y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
//
// We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
// contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
// splits are different. This is lovely, with one small snag: when
// we do this to y_3, we end up with a contribution back into the
// t^128 coefficient word. But notice that only the low seven bits
// of this word are affected, so there's no knock-on contribution
// into the t^64 word. Therefore, if we handle the high bits of each
// word together, and then the low bits, everything will be fine.
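// Concretely: splitting y_i = a_i + t^{64-k} b_i for each term t^k of
// R (k = 1, 2, 7), the b_i are obtained by shifting right by 64 - k --
// hence the 63, 62 and 57 below -- and the a_i t^k pieces by shifting
// left by k -- hence the 1, 2 and 7 in the second pass.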
// First, shift the high bits down.
ushr v2.2d, v1.2d, #63 // the b_i for t
ushr v3.2d, v1.2d, #62 // the b_i for t^2
ushr v4.2d, v1.2d, #57 // the b_i for t^7
eor v2.16b, v2.16b, v3.16b // add them all together
eor v2.16b, v2.16b, v4.16b
vshr128 v3, v2, 64
vshl128 v4, v2, 64
eor v1.16b, v1.16b, v3.16b // contribution into high half
eor v0.16b, v0.16b, v4.16b // and low half
// And then shift the low bits up.
shl v2.2d, v1.2d, #1
shl v3.2d, v1.2d, #2
shl v4.2d, v1.2d, #7
eor v1.16b, v1.16b, v2.16b // unit and t contribs
eor v3.16b, v3.16b, v4.16b // t^2 and t^7 contribs
eor v0.16b, v0.16b, v1.16b // mix everything together
eor v0.16b, v0.16b, v3.16b // ... and we're done
.endm
.macro mul64
// Enter with u and v in the low halves of v0 and v1, respectively;
// leave with z = u v in x2. Clobbers x2--x4.
// The multiplication is thankfully easy.
// v0 = // (u; ?)
// v1 = // (v; ?)
pmull v0.1q, v0.1d, v1.1d // u v
// Now we must reduce. This is essentially the same as the 128-bit
// case above, but mostly simpler because everything is smaller. The
// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
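// The same `b_i down, a_i up' two-pass trick as in `mul128' applies,
// with shift pairs 63/1 for t, 61/3 for t^3 and 60/4 for t^4. The
// code below saves a couple of instructions by pre-mixing x3 with
// x3 >> 1 (and later x3 << 1), so that a single shift by 60 (or 3)
// covers the t^3 and t^4 contributions together.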
// Before we get stuck in, transfer the product to general-purpose
// registers.
mov x3, v0.d[1]
mov x2, v0.d[0]
// First, shift the high bits down.
eor x4, x3, x3, lsr #1 // pre-mix t^3 and t^4
eor x3, x3, x3, lsr #63 // mix in t contribution
eor x3, x3, x4, lsr #60 // shift and mix in t^3 and t^4
// And then shift the low bits up.
eor x3, x3, x3, lsl #1 // mix unit and t; pre-mix t^3, t^4
eor x2, x2, x3 // fold them in
eor x2, x2, x3, lsl #3 // and t^3 and t^4
.endm
.macro mul96
// Enter with u in the least-significant 96 bits of v0, with zero in
// the upper 32 bits, and with the least-significant 64 bits of v in
// both halves of v1, and the upper 32 bits of v in the low 32 bits
// of each half of v2, with zero in the upper 32 bits; and with zero
// in v31. Yes, that's a bit hairy. Leave with the product u v in
// the low 96 bits of v0, and /junk/ in the high 32 bits. Clobbers
// v1--v6.
// This is an inconvenient size. There's nothing for it but to do
// four multiplications, as if for the 128-bit case. It's possible
// that there's cruft in the top 32 bits of the input registers, so
// shift both of them up by four bytes before we start. This will
// mean that the high 64 bits of the result (from GCM's viewpoint)
// will be zero.
// v0 = // (u_0 + u_1 t^32; u_2)
// v1 = // (v_0 + v_1 t^32; v_0 + v_1 t^32)
// v2 = // (v_2; v_2)
pmull2 v5.1q, v0.2d, v1.2d // u_2 (v_0 + v_1 t^32) t^32 = e_0
pmull v4.1q, v0.1d, v2.1d // v_2 (u_0 + u_1 t^32) t^32 = e_1
pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (d; 0)
pmull v3.1q, v0.1d, v1.1d // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
// + u_1 v_1 t^64 = f
// Extract the high and low halves of the 192-bit result. The answer
// we want is d t^128 + e t^64 + f, where e = e_0 + e_1. The low 96
// bits of the answer will end up in v0, with junk in the top 32
// bits; the high 96 bits will end up in v1, which must have zero in
// its top 32 bits.
//
// Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
// in the low 96 bits of a SIMD register, with junk in the top 32
// bits; and top(x) is the high 96 bits, also arranged in the low 96
// bits of a register, with /zero/ in the top 32 bits.
eor v4.16b, v4.16b, v5.16b // e_0 + e_1 = e
vshl128 v6, v6, 32 // top(d t^128)
vshr128 v5, v4, 32 // top(e t^64)
vshl128 v4, v4, 64 // bot(e t^64)
vshr128 v1, v3, 96 // top(f)
eor v6.16b, v6.16b, v5.16b // top(d t^128 + e t^64)
eor v0.16b, v3.16b, v4.16b // bot([d t^128] + e t^64 + f)
eor v1.16b, v1.16b, v6.16b // top(e t^64 + d t^128 + f)
// Finally, the reduction. This is essentially the same as the
// 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
// t^9 + t^6 + 1. The degrees are larger but not enough to cause
// trouble for the general approach. Unfortunately, we have to do
// this in 32-bit pieces rather than 64.
// First, shift the high bits down.
ushr v2.4s, v1.4s, #26 // the b_i for t^6
ushr v3.4s, v1.4s, #23 // the b_i for t^9
ushr v4.4s, v1.4s, #22 // the b_i for t^10
eor v2.16b, v2.16b, v3.16b // add them all together
eor v2.16b, v2.16b, v4.16b
vshr128 v3, v2, 64 // contribution for high half
vshl128 v2, v2, 32 // contribution for low half
eor v1.16b, v1.16b, v3.16b // apply to high half
eor v0.16b, v0.16b, v2.16b // and low half
// And then shift the low bits up.
shl v2.4s, v1.4s, #6
shl v3.4s, v1.4s, #9
shl v4.4s, v1.4s, #10
eor v1.16b, v1.16b, v2.16b // unit and t^6 contribs
eor v3.16b, v3.16b, v4.16b // t^9 and t^10 contribs
eor v0.16b, v0.16b, v1.16b // mix everything together
eor v0.16b, v0.16b, v3.16b // ... and we're done
.endm
.macro mul192
// Enter with u in v0 and the less-significant half of v1, with v
// duplicated across both halves of v2/v3/v4, and with zero in v31.
// Leave with the product u v in v0 and the bottom half of v1.
// Clobbers v16--v25.
// Start multiplying and accumulating pieces of product.
// v0 = // (u_0; u_1)
// v1 = // (u_2; ?)
// v2 = // (v_0; v_0)
// v3 = // (v_1; v_1)
// v4 = // (v_2; v_2)
pmull v16.1q, v0.1d, v2.1d // a = u_0 v_0
pmull v19.1q, v0.1d, v3.1d // u_0 v_1
pmull2 v21.1q, v0.2d, v2.2d // u_1 v_0
pmull v17.1q, v0.1d, v4.1d // u_0 v_2
pmull2 v22.1q, v0.2d, v3.2d // u_1 v_1
pmull v23.1q, v1.1d, v2.1d // u_2 v_0
eor v19.16b, v19.16b, v21.16b // b = u_0 v_1 + u_1 v_0
pmull2 v20.1q, v0.2d, v4.2d // u_1 v_2
pmull v24.1q, v1.1d, v3.1d // u_2 v_1
eor v17.16b, v17.16b, v22.16b // u_0 v_2 + u_1 v_1
pmull v18.1q, v1.1d, v4.1d // e = u_2 v_2
eor v17.16b, v17.16b, v23.16b // c = u_0 v_2 + u_1 v_1 + u_2 v_0
eor v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1
// Piece the product together.
// v16 = // (a_0; a_1)
// v19 = // (b_0; b_1)
// v17 = // (c_0; c_1)
// v20 = // (d_0; d_1)
// v18 = // (e_0; e_1)
vshl128 v21, v19, 64 // (0; b_0)
ext v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
vshr128 v23, v20, 64 // (d_1; 0)
eor v16.16b, v16.16b, v21.16b // (x_0; x_1)
eor v17.16b, v17.16b, v22.16b // (x_2; x_3)
eor v18.16b, v18.16b, v23.16b // (x_4; x_5)
// Next, the reduction. Our polynomial this time is p(t) = t^192 +
// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
// 128-bit case. I don't know why.
// First, shift the high bits down.
// v16 = // (y_0; y_1)
// v17 = // (y_2; y_3)
// v18 = // (y_4; y_5)
mov v19.d[0], v17.d[1] // (y_3; ?)
ushr v23.2d, v18.2d, #63 // hi b_i for t
ushr d20, d19, #63 // lo b_i for t
ushr v24.2d, v18.2d, #62 // hi b_i for t^2
ushr d21, d19, #62 // lo b_i for t^2
ushr v25.2d, v18.2d, #57 // hi b_i for t^7
ushr d22, d19, #57 // lo b_i for t^7
eor v23.16b, v23.16b, v24.16b // mix them all together
eor v20.8b, v20.8b, v21.8b
eor v23.16b, v23.16b, v25.16b
eor v20.8b, v20.8b, v22.8b
// Permute the high pieces while we fold in the b_i.
eor v17.16b, v17.16b, v23.16b
vshl128 v20, v20, 64
mov v19.d[0], v18.d[1] // (y_5; ?)
ext v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
eor v16.16b, v16.16b, v20.16b
// And finally shift the low bits up.
// v16 = // (y'_0; y'_1)
// v17 = // (y'_2; ?)
// v18 = // (y'_3; y'_4)
// v19 = // (y'_5; ?)
shl v20.2d, v18.2d, #1
shl d23, d19, #1
shl v21.2d, v18.2d, #2
shl d24, d19, #2
shl v22.2d, v18.2d, #7
shl d25, d19, #7
eor v18.16b, v18.16b, v20.16b // unit and t contribs
eor v19.8b, v19.8b, v23.8b
eor v21.16b, v21.16b, v22.16b // t^2 and t^7 contribs
eor v24.8b, v24.8b, v25.8b
eor v18.16b, v18.16b, v21.16b // all contribs
eor v19.8b, v19.8b, v24.8b
eor v0.16b, v16.16b, v18.16b // mix them into the low half
eor v1.8b, v17.8b, v19.8b
.endm
.macro mul256
// Enter with u in v0/v1, with v duplicated across both halves of
// v2--v5, and with zero in v31. Leave with the product u v in
// v0/v1. Clobbers ???.
// Now it's starting to look worthwhile to do Karatsuba. Suppose
// u = u_0 + u_1 B and v = v_0 + v_1 B. Then
//
// u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
//
// Call these coefficients of B^i a, b, and c, respectively, and
// let r = u_0 + u_1 and s = v_0 + v_1. Then observe that
//
// q = r s = (u_0 + u_1) (v_0 + v_1)
// = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
// = a + c + b
//
// The first two terms we've already calculated; the last is the
// remaining one we want. We'll set B = t^128. We know how to do
// 128-bit multiplications already, and Karatsuba is too annoying
// there, so there'll be 12 multiplications altogether, rather than
// the 16 we'd have if we did this the naïve way.
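// (So the cross coefficient is recovered as b = q + a + c -- addition
// is just XOR here -- which is why the code below accumulates q in
// v20/v21 and then simply XORs a and c into it before folding b in at
// the B = t^128 position.)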
// v0 = // u_0 = (u_00; u_01)
// v1 = // u_1 = (u_10; u_11)
// v2 = // (v_00; v_00)
// v3 = // (v_01; v_01)
// v4 = // (v_10; v_10)
// v5 = // (v_11; v_11)
eor v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11)
eor v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10
eor v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11
// Start by building the cross product, q = u_* v_*.
pmull v24.1q, v28.1d, v30.1d // u_*0 v_*1
pmull2 v25.1q, v28.2d, v29.2d // u_*1 v_*0
pmull v20.1q, v28.1d, v29.1d // u_*0 v_*0
pmull2 v21.1q, v28.2d, v30.2d // u_*1 v_*1
eor v24.16b, v24.16b, v25.16b // u_*0 v_*1 + u_*1 v_*0
vshr128 v25, v24, 64
vshl128 v24, v24, 64
eor v20.16b, v20.16b, v24.16b // q_0
eor v21.16b, v21.16b, v25.16b // q_1
// Next, work on the low half, a = u_0 v_0
pmull v24.1q, v0.1d, v3.1d // u_00 v_01
pmull2 v25.1q, v0.2d, v2.2d // u_01 v_00
pmull v16.1q, v0.1d, v2.1d // u_00 v_00
pmull2 v17.1q, v0.2d, v3.2d // u_01 v_01
eor v24.16b, v24.16b, v25.16b // u_00 v_01 + u_01 v_00
vshr128 v25, v24, 64
vshl128 v24, v24, 64
eor v16.16b, v16.16b, v24.16b // a_0
eor v17.16b, v17.16b, v25.16b // a_1
// Mix the pieces we have so far.
eor v20.16b, v20.16b, v16.16b
eor v21.16b, v21.16b, v17.16b
// Finally, work on the high half, c = u_1 v_1
pmull v24.1q, v1.1d, v5.1d // u_10 v_11
pmull2 v25.1q, v1.2d, v4.2d // u_11 v_10
pmull v18.1q, v1.1d, v4.1d // u_10 v_10
pmull2 v19.1q, v1.2d, v5.2d // u_11 v_11
eor v24.16b, v24.16b, v25.16b // u_10 v_11 + u_11 v_10
vshr128 v25, v24, 64
vshl128 v24, v24, 64
eor v18.16b, v18.16b, v24.16b // c_0
eor v19.16b, v19.16b, v25.16b // c_1
// Finish mixing the product together.
eor v20.16b, v20.16b, v18.16b
eor v21.16b, v21.16b, v19.16b
eor v17.16b, v17.16b, v20.16b
eor v18.16b, v18.16b, v21.16b
// Now we must reduce. This is essentially the same as the 192-bit
// case above, but more complicated because everything is bigger.
// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
// v16 = // (y_0; y_1)
// v17 = // (y_2; y_3)
// v18 = // (y_4; y_5)
// v19 = // (y_6; y_7)
ushr v24.2d, v18.2d, #62 // (y_4; y_5) b_i for t^2
ushr v25.2d, v19.2d, #62 // (y_6; y_7) b_i for t^2
ushr v26.2d, v18.2d, #59 // (y_4; y_5) b_i for t^5
ushr v27.2d, v19.2d, #59 // (y_6; y_7) b_i for t^5
ushr v28.2d, v18.2d, #54 // (y_4; y_5) b_i for t^10
ushr v29.2d, v19.2d, #54 // (y_6; y_7) b_i for t^10
eor v24.16b, v24.16b, v26.16b // mix the contributions together
eor v25.16b, v25.16b, v27.16b
eor v24.16b, v24.16b, v28.16b
eor v25.16b, v25.16b, v29.16b
vshr128 v26, v25, 64 // slide contribs into position
ext v25.16b, v24.16b, v25.16b, #8
vshl128 v24, v24, 64
eor v18.16b, v18.16b, v26.16b
eor v17.16b, v17.16b, v25.16b
eor v16.16b, v16.16b, v24.16b
// And then shift the low bits up.
// v16 = // (y'_0; y'_1)
// v17 = // (y'_2; y'_3)
// v18 = // (y'_4; y'_5)
// v19 = // (y'_6; y'_7)
shl v24.2d, v18.2d, #2 // (y'_4; y'_5) a_i for t^2
shl v25.2d, v19.2d, #2 // (y'_6; y'_7) a_i for t^2
shl v26.2d, v18.2d, #5 // (y'_4; y'_5) a_i for t^5
shl v27.2d, v19.2d, #5 // (y'_6; y'_7) a_i for t^5
shl v28.2d, v18.2d, #10 // (y'_4; y'_5) a_i for t^10
shl v29.2d, v19.2d, #10 // (y'_6; y'_7) a_i for t^10
eor v18.16b, v18.16b, v24.16b // mix the contributions together
eor v19.16b, v19.16b, v25.16b
eor v26.16b, v26.16b, v28.16b
eor v27.16b, v27.16b, v29.16b
eor v18.16b, v18.16b, v26.16b
eor v19.16b, v19.16b, v27.16b
eor v0.16b, v16.16b, v18.16b
eor v1.16b, v17.16b, v19.16b
.endm
///--------------------------------------------------------------------------
/// Main code.
// There are a number of representations of field elements in this code and
// it can be confusing.
//
// * The `external format' consists of a sequence of contiguous bytes in
// memory called a `block'. The GCM spec explains how to interpret this
// block as an element of a finite field. As discussed extensively, this
// representation is very annoying for a number of reasons. On the other
// hand, this code never actually deals with it directly.
//
// * The `register format' consists of one or more SIMD registers,
// depending on the block size. The bits in each byte are reversed,
// compared to the external format, which makes the polynomials
// completely vanilla, unlike all of the other GCM implementations.
//
// * The `table format' is just like the `register format', only the two
// halves of each 128-bit SIMD register are the same, so we need twice as many
// registers.
//
// * The `words' format consists of a sequence of bytes, as in the
// `external format', but, according to the blockcipher in use, the bytes
// within each 32-bit word may be reversed (`big-endian') or not
// (`little-endian'). Accordingly, there are separate entry points for
// each variant, identified with `b' or `l'.
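// A note on the shuffling in the entry points below: `rbit' converts
// between the external bit ordering and the register format, and the
// big-endian (`b') variants additionally byte-swap each 32-bit word
// with `rev32' on the way in and out, which the little-endian (`l')
// variants don't need. (The 64-bit functions do the output fix-up on
// general-purpose registers instead, since `mul64' leaves its result
// in x2.)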
FUNC(gcm_mulk_128b_arm64_pmull)
// On entry, x0 points to a 128-bit field element A in big-endian
// words format; x1 points to a field-element K in table format. On
// exit, A is updated with the product A K.
ldr q0, [x0]
ldp q1, q2, [x1]
rev32 v0.16b, v0.16b
vzero
rbit v0.16b, v0.16b
mul128
rbit v0.16b, v0.16b
rev32 v0.16b, v0.16b
str q0, [x0]
ret
ENDFUNC
FUNC(gcm_mulk_128l_arm64_pmull)
// On entry, x0 points to a 128-bit field element A in little-endian
// words format; x1 points to a field-element K in table format. On
// exit, A is updated with the product A K.
ldr q0, [x0]
ldp q1, q2, [x1]
vzero
rbit v0.16b, v0.16b
mul128
rbit v0.16b, v0.16b
str q0, [x0]
ret
ENDFUNC
FUNC(gcm_mulk_64b_arm64_pmull)
// On entry, x0 points to a 64-bit field element A in big-endian
// words format; x1 points to a field-element K in table format. On
// exit, A is updated with the product A K.
ldr d0, [x0]
ldr q1, [x1]
rev32 v0.8b, v0.8b
rbit v0.8b, v0.8b
mul64
rbit x2, x2
ror x2, x2, #32
str x2, [x0]
ret
ENDFUNC
FUNC(gcm_mulk_64l_arm64_pmull)
// On entry, x0 points to a 64-bit field element A in little-endian
// words format; x1 points to a field-element K in table format. On
// exit, A is updated with the product A K.
ldr d0, [x0]
ldr q1, [x1]
rbit v0.8b, v0.8b
mul64
rbit x2, x2
rev x2, x2
str x2, [x0]
ret
ENDFUNC
FUNC(gcm_mulk_96b_arm64_pmull)
// On entry, x0 points to a 96-bit field element A in big-endian
// words format; x1 points to a field-element K in table format. On
// exit, A is updated with the product A K.
ldr w2, [x0, #8]
ldr d0, [x0, #0]
mov v0.d[1], x2
ldp q1, q2, [x1]
rev32 v0.16b, v0.16b
vzero
rbit v0.16b, v0.16b
mul96
rbit v0.16b, v0.16b
rev32 v0.16b, v0.16b
mov w2, v0.s[2]
str d0, [x0, #0]
str w2, [x0, #8]
ret
ENDFUNC
FUNC(gcm_mulk_96l_arm64_pmull)
// On entry, x0 points to a 96-bit field element A in little-endian
// words format; x1 points to a field-element K in table format. On
// exit, A is updated with the product A K.
ldr d0, [x0, #0]
ldr w2, [x0, #8]
mov v0.d[1], x2
ldp q1, q2, [x1]
rbit v0.16b, v0.16b
vzero
mul96
rbit v0.16b, v0.16b
mov w2, v0.s[2]
str d0, [x0, #0]
str w2, [x0, #8]
ret
ENDFUNC
FUNC(gcm_mulk_192b_arm64_pmull)
// On entry, x0 points to a 192-bit field element A in big-endian
// words format; x1 points to a field-element K in table format. On
// exit, A is updated with the product A K.
ldr q0, [x0, #0]
ldr d1, [x0, #16]
ldp q2, q3, [x1, #0]
ldr q4, [x1, #32]
rev32 v0.16b, v0.16b
rev32 v1.8b, v1.8b
rbit v0.16b, v0.16b
rbit v1.8b, v1.8b
vzero
mul192
rev32 v0.16b, v0.16b
rev32 v1.8b, v1.8b
rbit v0.16b, v0.16b
rbit v1.8b, v1.8b
str q0, [x0, #0]
str d1, [x0, #16]
ret
ENDFUNC
FUNC(gcm_mulk_192l_arm64_pmull)
// On entry, x0 points to a 192-bit field element A in little-endian
// words format; x1 points to a field-element K in table format. On
// exit, A is updated with the product A K.
ldr q0, [x0, #0]
ldr d1, [x0, #16]
ldp q2, q3, [x1, #0]
ldr q4, [x1, #32]
rbit v0.16b, v0.16b
rbit v1.8b, v1.8b
vzero
mul192
rbit v0.16b, v0.16b
rbit v1.8b, v1.8b
str q0, [x0, #0]
str d1, [x0, #16]
ret
ENDFUNC
FUNC(gcm_mulk_256b_arm64_pmull)
// On entry, x0 points to a 256-bit field element A in big-endian
// words format; x1 points to a field-element K in table format. On
// exit, A is updated with the product A K.
ldp q0, q1, [x0]
ldp q2, q3, [x1, #0]
ldp q4, q5, [x1, #32]
rev32 v0.16b, v0.16b
rev32 v1.16b, v1.16b
rbit v0.16b, v0.16b
rbit v1.16b, v1.16b
vzero
mul256
rev32 v0.16b, v0.16b
rev32 v1.16b, v1.16b
rbit v0.16b, v0.16b
rbit v1.16b, v1.16b
stp q0, q1, [x0]
ret
ENDFUNC
FUNC(gcm_mulk_256l_arm64_pmull)
// On entry, x0 points to a 256-bit field element A in little-endian
// words format; x1 points to a field-element K in table format. On
// exit, A is updated with the product A K.
ldp q0, q1, [x0]
ldp q2, q3, [x1, #0]
ldp q4, q5, [x1, #32]
rbit v0.16b, v0.16b
rbit v1.16b, v1.16b
vzero
mul256
rbit v0.16b, v0.16b
rbit v1.16b, v1.16b
stp q0, q1, [x0]
ret
ENDFUNC
///----- That's all, folks --------------------------------------------------