Commit be59b2b31d: Import

aes-arm64.S (new file)
@@ -0,0 +1,286 @@
#include "asm-common.h"

        .arch   armv8-a+crypto

        .extern F(abort)
        .extern F(rijndael_rcon)

        .text

///--------------------------------------------------------------------------
/// Main code.

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but the internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.
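///
/// (Illustrative note, not part of the original: `end-swapping' here means
/// reversing the bytes within each 32-bit word, which is what the `rev32
/// vN.16b, ...' instructions below do.  For example, a word stored in
/// memory as the big-endian byte sequence 00 11 22 33 is held in a vector
/// register as 33 22 11 00 while the AES instructions operate on it.)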

        // Useful constants.
        .equ    maxrounds, 16           // maximum number of rounds
        .equ    maxblksz, 32            // maximum block size, in bytes
        .equ    kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer

        // Context structure.
        .equ    nr, 0                   // number of rounds
        .equ    w, nr + 4               // encryption key words
        .equ    wi, w + kbufsz          // decryption key words
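
        // (Illustrative sketch, not part of the original: these offsets
        // assume a C context layout along the lines of
        //
        //      struct rijndael_ctx_assumed {
        //              uint32_t nr;      /* number of rounds */
        //              uint32_t w[136];  /* encryption keys: kbufsz bytes */
        //              uint32_t wi[136]; /* decryption keys */
        //      };
        //
        // where 136 = kbufsz/4 = 32*(16 + 1)/4.  The real structure is
        // defined on the C side of the library; the name here is
        // hypothetical.)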

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm64_crypto)

        // Arguments:
        //      x0 = pointer to context
        //      w1 = block size in 32-bit words
        //      x2 = pointer to key material
        //      x3 = key size in words

        pushreg x29, x30
        mov     x29, sp

        // The initial round key material is taken directly from the input
        // key, so copy it over.  Unfortunately, the key material is not
        // guaranteed to be aligned in any especially useful way.  Assume
        // that alignment traps are not enabled.  (Why would they be?  On
        // A32, alignment traps were part of a transition plan which changed
        // the way unaligned loads and stores behaved, but there's never been
        // any other behaviour on A64.)
        mov     x15, x3
        add     x4, x0, #w
0:      sub     x15, x15, #1
        ldr     w14, [x2], #4
        str     w14, [x4], #4
        cbnz    x15, 0b

        // Find out other useful things and prepare for the main loop.
9:      ldr     w9, [x0, #nr]           // number of rounds
        madd    w2, w1, w9, w1          // total key size in words
        leaext  x5, rijndael_rcon       // round constants
        sub     x6, x2, x3              // minus what we've copied already
        add     x7, x0, #w              // position in previous cycle
        movi    v1.4s, #0               // all-zero register for the key
        mov     x8, #0                  // position in current cycle

        // Main key expansion loop.  Dispatch according to the position in
        // the cycle.
0:      ldr     w15, [x7], #4           // word from previous cycle
        cbz     x8, 1f                  // first word of the cycle?
        cmp     x8, #4                  // fourth word of the cycle?
        b.ne    2f
        cmp     x3, #7                  // seven or eight words of key?
        b.cc    2f

        // Fourth word of the cycle, seven or eight words of key.  We must
        // do the byte substitution.
        dup     v0.4s, w14
        aese    v0.16b, v1.16b          // effectively, just SubBytes
        mov     w14, v0.s[0]
        b       2f

        // First word of the cycle.  Byte substitution, rotation, and round
        // constant.
1:      ldrb    w13, [x5], #1           // next round constant
        dup     v0.4s, w14
        aese    v0.16b, v1.16b          // effectively, just SubBytes
        mov     w14, v0.s[0]
        eor     w14, w13, w14, ror #8

        // Common ending: mix in the word from the previous cycle and store.
2:      eor     w14, w14, w15
        str     w14, [x4], #4

        // Prepare for the next iteration.  If we're done, then stop; if
        // we've finished a cycle then reset the counter.
        add     x8, x8, #1
        sub     x6, x6, #1
        cmp     x8, x3
        cbz     x6, 9f
        cmov.cs x8, xzr
        b       0b
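
        // (Illustrative note, not part of the original: the loop above is
        // the standard Rijndael key-schedule recurrence
        //
        //      w[i] = w[i - k] ^ f(w[i - 1])
        //
        // where k is the key length in words (x3), and f applies SubBytes,
        // RotWord, and a round constant when i % k == 0, SubBytes alone
        // when k > 6 and i % k == 4, and nothing otherwise.  Duplicating
        // the word across all four lanes before `aese' with an all-zero
        // key means the AddRoundKey step does nothing and ShiftRows only
        // permutes equal bytes, leaving just the SubBytes we wanted; the
        // RotWord rotation is folded into the `ror #8' where the round
        // constant is mixed in.)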

        // Next job is to construct the decryption keys.  The keys for the
        // first and last rounds don't need to be mangled, but the remaining
        // ones do -- and they all need to be reordered too.
        //
        // The plan of action, then, is to copy the final encryption round's
        // keys into place first, then to do each of the intermediate rounds
        // in reverse order, and finally do the first round.
        //
        // Do all the heavy lifting with the vector registers.  The order
        // we're doing this in means that it's OK if we read or write too
        // much, and there's easily enough buffer space for the
        // over-enthusiastic reads and writes because the context has space
        // for 32-byte blocks, which is our maximum and an exact fit for two
        // full-width registers.
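        //
        // (Illustrative note, not part of the original: this is the usual
        // `equivalent inverse cipher' construction.  To reuse the
        // encryption-shaped round structure for decryption, each middle
        // round key must be transformed by InvMixColumns -- which is
        // exactly what the `aesimc' instructions below do to the copied
        // keys; the first and last rounds have no MixColumns and so escape
        // the mangling.)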
9:      add     x5, x0, #wi
        add     x4, x0, #w
        add     x4, x4, w2, uxtw #2
        sub     x4, x4, w1, uxtw #2     // last round's keys

        // Copy the last encryption round's keys.
        ld1     {v0.4s, v1.4s}, [x4]
        st1     {v0.4s, v1.4s}, [x5]

        // Update the loop variables and stop if we've finished.
0:      sub     w9, w9, #1
        add     x5, x5, w1, uxtw #2
        sub     x4, x4, w1, uxtw #2
        cbz     w9, 9f

        // Do another middle round's keys...
        ld1     {v0.4s, v1.4s}, [x4]
        aesimc  v0.16b, v0.16b
        aesimc  v1.16b, v1.16b
        st1     {v0.4s, v1.4s}, [x5]
        b       0b

        // Finally do the first encryption round.
9:      ld1     {v0.4s, v1.4s}, [x4]
        st1     {v0.4s, v1.4s}, [x5]

        // If the block size is not exactly four words then we must end-swap
        // everything.  We can use fancy vector toys for this.
        cmp     w1, #4
        b.eq    9f

        // End-swap the encryption keys.
        add     x1, x0, #w
        bl      endswap_block

        // And the decryption keys.
        add     x1, x0, #wi
        bl      endswap_block

        // All done.
9:      popreg  x29, x30
        ret

ENDFUNC

INTFUNC(endswap_block)
        // End-swap w2 words starting at x1.  x1 is clobbered; w2 is not.
        // It's OK to work in 16-byte chunks.

        mov     w3, w2
0:      subs    w3, w3, #4
        ld1     {v0.4s}, [x1]
        rev32   v0.16b, v0.16b
        st1     {v0.4s}, [x1], #16
        b.hi    0b
        ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

        .macro  encdec  op, aes, mc, koff
FUNC(rijndael_\op\()_arm64_crypto)

        // Arguments:
        //      x0 = pointer to context
        //      x1 = pointer to input block
        //      x2 = pointer to output block

        // Set things up ready.
        ldr     w3, [x0, #nr]
        add     x0, x0, #\koff
        ld1     {v0.4s}, [x1]
        rev32   v0.16b, v0.16b

        // Check the number of rounds and dispatch.
        cmp     w3, #14
        b.eq    14f
        cmp     w3, #10
        b.eq    10f
        cmp     w3, #12
        b.eq    12f
        cmp     w3, #13
        b.eq    13f
        cmp     w3, #11
        b.eq    11f
        callext F(abort)
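
        // (Illustrative note, not part of the original: the round counts
        // correspond to key and block sizes.  AES proper has 10, 12, or 14
        // rounds for 128-, 192-, or 256-bit keys; the odd counts arise
        // from the larger Rijndael block sizes, since the number of rounds
        // is determined by the larger of the block and key lengths.
        // Anything else indicates a corrupt context, hence the `abort'.)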

        // Eleven rounds.
11:     ld1     {v16.4s}, [x0], #16
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        b       10f

        // Twelve rounds.
12:     ld1     {v16.4s, v17.4s}, [x0], #32
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        b       10f

        // Thirteen rounds.
13:     ld1     {v16.4s-v18.4s}, [x0], #48
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v18.16b
        \mc     v0.16b, v0.16b
        b       10f

        // Fourteen rounds.  (Drops through to the ten round case because
        // this is the next most common.)
14:     ld1     {v16.4s-v19.4s}, [x0], #64
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v18.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v19.16b
        \mc     v0.16b, v0.16b
        // Drop through...

        // Ten rounds.
10:     ld1     {v16.4s-v19.4s}, [x0], #64
        ld1     {v20.4s-v23.4s}, [x0], #64
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v18.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v19.16b
        \mc     v0.16b, v0.16b

        ld1     {v16.4s-v18.4s}, [x0], #48
        \aes    v0.16b, v20.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v21.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v22.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v23.16b
        \mc     v0.16b, v0.16b

        // Final round has no MixColumns, but is followed by final whitening.
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        eor     v0.16b, v0.16b, v18.16b

        // All done.
        rev32   v0.16b, v0.16b
        st1     {v0.4s}, [x2]
        ret

ENDFUNC
        .endm

        encdec  eblk, aese, aesmc, w
        encdec  dblk, aesd, aesimc, wi

///----- That's all, folks --------------------------------------------------

asm-common.h (new file, 1180 lines)
The diff for this file is not shown because it is too large.

gcm-arm64-pmull.S (new file)
@@ -0,0 +1,631 @@
#include "asm-common.h"

        .arch   armv8-a+crypto

        .text

///--------------------------------------------------------------------------
/// Multiplication macros.

        // The good news is that we have a fancy instruction to do the
        // multiplications.  The bad news is that it's not particularly well-
        // suited to the job.
        //
        // For one thing, it only does a 64-bit multiplication, so in general
        // we'll need to synthesize the full-width multiply by hand.  For
        // another thing, it doesn't help with the reduction, so we have to
        // do that by hand too.  And, finally, GCM has crazy bit ordering,
        // and the instruction does nothing useful for that at all.
        //
        // Focusing on that last problem first: the bits aren't in monotonic
        // significance order unless we permute them.  Fortunately, ARM64 has
        // an instruction which will just permute the bits in each byte for
        // us, so we don't have to worry about this very much.
        //
        // Our main weapons, the `pmull' and `pmull2' instructions, work on
        // 64-bit operands, in half of a vector register, and produce 128-bit
        // results.  But neither of them will multiply the high half of one
        // vector by the low half of a second one, so we have a problem,
        // which we solve by representing one of the operands redundantly:
        // rather than packing the 64-bit pieces together, we duplicate each
        // 64-bit piece across both halves of a register.
        //
        // The commentary for `mul128' is the most detailed.  The other
        // macros assume that you've already read and understood that.
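        //
        // (Illustrative note, not part of the original: the bit-permuting
        // instruction referred to is `rbit', which reverses the bits
        // within each byte of a vector.  Together with the byte order of
        // the loads, this turns GCM's reflected bit convention into an
        // ordinary polynomial whose coefficient significance increases
        // monotonically, so `pmull' computes a plain carry-less product.)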

        .macro  mul128
        // Enter with u and v in v0 and v1/v2 respectively, and 0 in v31;
        // leave with z = u v in v0.  Clobbers v1--v6.

        // First for the double-precision multiplication.  It's tempting to
        // use Karatsuba's identity here, but I suspect that loses more in
        // the shifting, bit-twiddling, and dependency chains that it gains
        // in saving a multiplication which otherwise pipelines well.
        // v0 =                         // (u_0; u_1)
        // v1/v2 =                      // (v_0; v_1)
        pmull2  v3.1q, v0.2d, v1.2d     // u_1 v_0
        pmull   v4.1q, v0.1d, v2.1d     // u_0 v_1
        pmull2  v5.1q, v0.2d, v2.2d     // (t_1; x_3) = u_1 v_1
        pmull   v6.1q, v0.1d, v1.1d     // (x_0; t_0) = u_0 v_0

        // Arrange the pieces to form a double-precision polynomial.
        eor     v3.16b, v3.16b, v4.16b  // (m_0; m_1) = u_0 v_1 + u_1 v_0
        vshr128 v4, v3, 64              // (m_1; 0)
        vshl128 v3, v3, 64              // (0; m_0)
        eor     v1.16b, v5.16b, v4.16b  // (x_2; x_3)
        eor     v0.16b, v6.16b, v3.16b  // (x_0; x_1)

        // And now the only remaining difficulty is that the result needs to
        // be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128
        // = t^7 + t^2 + t + 1 in our field.  So far, we've calculated z_0
        // and z_1 such that z_0 + z_1 R = u v using the identity R = t^128:
        // now we must collapse the two halves of y together using the other
        // identity R = t^7 + t^2 + t + 1.
        //
        // We do this by working on y_2 and y_3 separately, so consider y_i
        // for i = 2 or 3.  Certainly, y_i t^{64i} = y_i R t^{64(i-2)} =
        // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
        // directly without breaking up the 64-bit word structure.  Instead,
        // we start by considering just y_i t^7 t^{64(i-2)}, which again
        // looks tricky.  Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
        // then
        //
        //      y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
        //
        // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
        // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
        // splits are different.  This is lovely, with one small snag: when
        // we do this to y_3, we end up with a contribution back into the
        // t^128 coefficient word.  But notice that only the low seven bits
        // of this word are affected, so there's no knock-on contribution
        // into the t^64 word.  Therefore, if we handle the high bits of each
        // word together, and then the low bits, everything will be fine.
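        //
        // (Illustrative example, not part of the original: take i = 2, so
        // y_2 sits in the t^128 word.  Writing y_2 = a_2 + t^57 b_2, the
        // t^7 term of R contributes y_2 t^7 = a_2 t^7 + b_2 t^64: the
        // `ushr ..., #57' below extracts exactly the b_i pieces that spill
        // into the next word up, while `shl ..., #7' supplies the a_i t^7
        // pieces that stay within the word.  The t and t^2 terms work the
        // same way with shift pairs 63/1 and 62/2, and the unit term needs
        // no shifting at all.)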

        // First, shift the high bits down.
        ushr    v2.2d, v1.2d, #63       // the b_i for t
        ushr    v3.2d, v1.2d, #62       // the b_i for t^2
        ushr    v4.2d, v1.2d, #57       // the b_i for t^7
        eor     v2.16b, v2.16b, v3.16b  // add them all together
        eor     v2.16b, v2.16b, v4.16b
        vshr128 v3, v2, 64
        vshl128 v4, v2, 64
        eor     v1.16b, v1.16b, v3.16b  // contribution into high half
        eor     v0.16b, v0.16b, v4.16b  // and low half

        // And then shift the low bits up.
        shl     v2.2d, v1.2d, #1
        shl     v3.2d, v1.2d, #2
        shl     v4.2d, v1.2d, #7
        eor     v1.16b, v1.16b, v2.16b  // unit and t contribs
        eor     v3.16b, v3.16b, v4.16b  // t^2 and t^7 contribs
        eor     v0.16b, v0.16b, v1.16b  // mix everything together
        eor     v0.16b, v0.16b, v3.16b  // ... and we're done
        .endm

        .macro  mul64
        // Enter with u and v in the low halves of v0 and v1, respectively;
        // leave with z = u v in x2.  Clobbers x2--x4.

        // The multiplication is thankfully easy.
        // v0 =                         // (u; ?)
        // v1 =                         // (v; ?)
        pmull   v0.1q, v0.1d, v1.1d     // u v

        // Now we must reduce.  This is essentially the same as the 128-bit
        // case above, but mostly simpler because everything is smaller.  The
        // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.

        // Before we get stuck in, transfer the product to general-purpose
        // registers.
        mov     x3, v0.d[1]
        mov     x2, v0.d[0]

        // First, shift the high bits down.
        eor     x4, x3, x3, lsr #1      // pre-mix t^3 and t^4
        eor     x3, x3, x3, lsr #63     // mix in t contribution
        eor     x3, x3, x4, lsr #60     // shift and mix in t^3 and t^4

        // And then shift the low bits up.
        eor     x3, x3, x3, lsl #1      // mix unit and t; pre-mix t^3, t^4
        eor     x2, x2, x3              // fold them in
        eor     x2, x2, x3, lsl #3      // and t^3 and t^4
        .endm
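
        // (Illustrative sketch, not part of the original: in portable C
        // the same reduction might read, with `hi' and `lo' hypothetical
        // uint64_t variables holding the two product halves,
        //
        //      t  = hi ^ (hi >> 1);    /* pre-mix t^3 and t^4 */
        //      hi ^= hi >> 63;         /* t contribution */
        //      hi ^= t >> 60;          /* t^3 and t^4 contributions */
        //      hi ^= hi << 1;          /* unit and t; pre-mix t^3, t^4 */
        //      lo ^= hi;               /* fold them in */
        //      lo ^= hi << 3;          /* and t^3 and t^4 */
        //
        // leaving the reduced 64-bit result in lo.)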

        .macro  mul96
        // Enter with u in the least-significant 96 bits of v0, with zero in
        // the upper 32 bits, and with the least-significant 64 bits of v in
        // both halves of v1, and the upper 32 bits of v in the low 32 bits
        // of each half of v2, with zero in the upper 32 bits; and with zero
        // in v31.  Yes, that's a bit hairy.  Leave with the product u v in
        // the low 96 bits of v0, and /junk/ in the high 32 bits.  Clobbers
        // v1--v6.

        // This is an inconvenient size.  There's nothing for it but to do
        // four multiplications, as if for the 128-bit case.  It's possible
        // that there's cruft in the top 32 bits of the input registers, so
        // shift both of them up by four bytes before we start.  This will
        // mean that the high 64 bits of the result (from GCM's viewpoint)
        // will be zero.
        // v0 =                         // (u_0 + u_1 t^32; u_2)
        // v1 =                         // (v_0 + v_1 t^32; v_0 + v_1 t^32)
        // v2 =                         // (v_2; v_2)
        pmull2  v5.1q, v0.2d, v1.2d     // u_2 (v_0 + v_1 t^32) t^32 = e_0
        pmull   v4.1q, v0.1d, v2.1d     // v_2 (u_0 + u_1 t^32) t^32 = e_1
        pmull2  v6.1q, v0.2d, v2.2d     // u_2 v_2 = d = (d; 0)
        pmull   v3.1q, v0.1d, v1.1d     // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
                                        //   + u_1 v_1 t^64 = f

        // Extract the high and low halves of the 192-bit result.  The answer
        // we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The low 96
        // bits of the answer will end up in v0, with junk in the top 32
        // bits; the high 96 bits will end up in v1, which must have zero in
        // its top 32 bits.
        //
        // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
        // in the low 96 bits of a SIMD register, with junk in the top 32
        // bits; and top(x) is the high 96 bits, also arranged in the low 96
        // bits of a register, with /zero/ in the top 32 bits.
        eor     v4.16b, v4.16b, v5.16b  // e_0 + e_1 = e
        vshl128 v6, v6, 32              // top(d t^128)
        vshr128 v5, v4, 32              // top(e t^64)
        vshl128 v4, v4, 64              // bot(e t^64)
        vshr128 v1, v3, 96              // top(f)
        eor     v6.16b, v6.16b, v5.16b  // top(d t^128 + e t^64)
        eor     v0.16b, v3.16b, v4.16b  // bot([d t^128] + e t^64 + f)
        eor     v1.16b, v1.16b, v6.16b  // top(e t^64 + d t^128 + f)

        // Finally, the reduction.  This is essentially the same as the
        // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
        // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
        // trouble for the general approach.  Unfortunately, we have to do
        // this in 32-bit pieces rather than 64.

        // First, shift the high bits down.
        ushr    v2.4s, v1.4s, #26       // the b_i for t^6
        ushr    v3.4s, v1.4s, #23       // the b_i for t^9
        ushr    v4.4s, v1.4s, #22       // the b_i for t^10
        eor     v2.16b, v2.16b, v3.16b  // add them all together
        eor     v2.16b, v2.16b, v4.16b
        vshr128 v3, v2, 64              // contribution for high half
        vshl128 v2, v2, 32              // contribution for low half
        eor     v1.16b, v1.16b, v3.16b  // apply to high half
        eor     v0.16b, v0.16b, v2.16b  // and low half

        // And then shift the low bits up.
        shl     v2.4s, v1.4s, #6
        shl     v3.4s, v1.4s, #9
        shl     v4.4s, v1.4s, #10
        eor     v1.16b, v1.16b, v2.16b  // unit and t^6 contribs
        eor     v3.16b, v3.16b, v4.16b  // t^9 and t^10 contribs
        eor     v0.16b, v0.16b, v1.16b  // mix everything together
        eor     v0.16b, v0.16b, v3.16b  // ... and we're done
        .endm

        .macro  mul192
        // Enter with u in v0 and the less-significant half of v1, with v
        // duplicated across both halves of v2/v3/v4, and with zero in v31.
        // Leave with the product u v in v0 and the bottom half of v1.
        // Clobbers v16--v25.

        // Start multiplying and accumulating pieces of product.
        // v0 =                         // (u_0; u_1)
        // v1 =                         // (u_2; ?)
        // v2 =                         // (v_0; v_0)
        // v3 =                         // (v_1; v_1)
        // v4 =                         // (v_2; v_2)
        pmull   v16.1q, v0.1d, v2.1d    // a = u_0 v_0

        pmull   v19.1q, v0.1d, v3.1d    // u_0 v_1
        pmull2  v21.1q, v0.2d, v2.2d    // u_1 v_0

        pmull   v17.1q, v0.1d, v4.1d    // u_0 v_2
        pmull2  v22.1q, v0.2d, v3.2d    // u_1 v_1
        pmull   v23.1q, v1.1d, v2.1d    // u_2 v_0
        eor     v19.16b, v19.16b, v21.16b // b = u_0 v_1 + u_1 v_0

        pmull2  v20.1q, v0.2d, v4.2d    // u_1 v_2
        pmull   v24.1q, v1.1d, v3.1d    // u_2 v_1
        eor     v17.16b, v17.16b, v22.16b // u_0 v_2 + u_1 v_1

        pmull   v18.1q, v1.1d, v4.1d    // e = u_2 v_2
        eor     v17.16b, v17.16b, v23.16b // c = u_0 v_2 + u_1 v_1 + u_2 v_0
        eor     v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1

        // Piece the product together.
        // v16 =                        // (a_0; a_1)
        // v19 =                        // (b_0; b_1)
        // v17 =                        // (c_0; c_1)
        // v20 =                        // (d_0; d_1)
        // v18 =                        // (e_0; e_1)
        vshl128 v21, v19, 64            // (0; b_0)
        ext     v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
        vshr128 v23, v20, 64            // (d_1; 0)
        eor     v16.16b, v16.16b, v21.16b // (x_0; x_1)
        eor     v17.16b, v17.16b, v22.16b // (x_2; x_3)
        eor     v18.16b, v18.16b, v23.16b // (x_4; x_5)

        // Next, the reduction.  Our polynomial this time is p(t) = t^192 +
        // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
        // 128-bit case.  I don't know why.

        // First, shift the high bits down.
        // v16 =                        // (y_0; y_1)
        // v17 =                        // (y_2; y_3)
        // v18 =                        // (y_4; y_5)
        mov     v19.d[0], v17.d[1]      // (y_3; ?)

        ushr    v23.2d, v18.2d, #63     // hi b_i for t
        ushr    d20, d19, #63           // lo b_i for t
        ushr    v24.2d, v18.2d, #62     // hi b_i for t^2
        ushr    d21, d19, #62           // lo b_i for t^2
        ushr    v25.2d, v18.2d, #57     // hi b_i for t^7
        ushr    d22, d19, #57           // lo b_i for t^7
        eor     v23.16b, v23.16b, v24.16b // mix them all together
        eor     v20.8b, v20.8b, v21.8b
        eor     v23.16b, v23.16b, v25.16b
        eor     v20.8b, v20.8b, v22.8b

        // Permute the high pieces while we fold in the b_i.
        eor     v17.16b, v17.16b, v23.16b
        vshl128 v20, v20, 64
        mov     v19.d[0], v18.d[1]      // (y_5; ?)
        ext     v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
        eor     v16.16b, v16.16b, v20.16b

        // And finally shift the low bits up.
        // v16 =                        // (y'_0; y'_1)
        // v17 =                        // (y'_2; ?)
        // v18 =                        // (y'_3; y'_4)
        // v19 =                        // (y'_5; ?)
        shl     v20.2d, v18.2d, #1
        shl     d23, d19, #1
        shl     v21.2d, v18.2d, #2
        shl     d24, d19, #2
        shl     v22.2d, v18.2d, #7
        shl     d25, d19, #7
        eor     v18.16b, v18.16b, v20.16b // unit and t contribs
        eor     v19.8b, v19.8b, v23.8b
        eor     v21.16b, v21.16b, v22.16b // t^2 and t^7 contribs
        eor     v24.8b, v24.8b, v25.8b
        eor     v18.16b, v18.16b, v21.16b // all contribs
        eor     v19.8b, v19.8b, v24.8b
        eor     v0.16b, v16.16b, v18.16b // mix them into the low half
        eor     v1.8b, v17.8b, v19.8b
        .endm

        .macro  mul256
        // Enter with u in v0/v1, with v duplicated across both halves of
        // v2--v5, and with zero in v31.  Leave with the product u v in
        // v0/v1.  Clobbers ???.

        // Now it's starting to look worthwhile to do Karatsuba.  Suppose
        // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
        //
        //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
        //
        // Name the coefficients of B^i as a, b, and c, respectively, and
        // let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
        //
        //      q = r s = (u_0 + u_1) (v_0 + v_1)
        //          = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
        //          = a + c + b
        //
        // The first two terms we've already calculated; the last is the
        // remaining one we want.  We'll set B = t^128.  We know how to do
        // 128-bit multiplications already, and Karatsuba is too annoying
        // there, so there'll be 12 multiplications altogether, rather than
        // the 16 we'd have if we did this the naïve way.
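        //
        // (Illustrative check, not part of the original: schoolbook
        // multiplication would need four 128-bit products, each costing
        // four `pmull's, for 16 in all; Karatsuba computes only a, c, and
        // q, and recovers b = q + a + c -- addition and subtraction
        // coincide in characteristic 2 -- so three 128-bit products
        // suffice, i.e., 12 `pmull's.)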
        // v0 =                         // u_0 = (u_00; u_01)
        // v1 =                         // u_1 = (u_10; u_11)
        // v2 =                         // (v_00; v_00)
        // v3 =                         // (v_01; v_01)
        // v4 =                         // (v_10; v_10)
        // v5 =                         // (v_11; v_11)

        eor     v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11)
        eor     v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10
        eor     v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11

        // Start by building the cross product, q = u_* v_*.
        pmull   v24.1q, v28.1d, v30.1d  // u_*0 v_*1
        pmull2  v25.1q, v28.2d, v29.2d  // u_*1 v_*0
        pmull   v20.1q, v28.1d, v29.1d  // u_*0 v_*0
        pmull2  v21.1q, v28.2d, v30.2d  // u_*1 v_*1
        eor     v24.16b, v24.16b, v25.16b // u_*0 v_*1 + u_*1 v_*0
        vshr128 v25, v24, 64
        vshl128 v24, v24, 64
        eor     v20.16b, v20.16b, v24.16b // q_0
        eor     v21.16b, v21.16b, v25.16b // q_1

        // Next, work on the low half, a = u_0 v_0.
        pmull   v24.1q, v0.1d, v3.1d    // u_00 v_01
        pmull2  v25.1q, v0.2d, v2.2d    // u_01 v_00
        pmull   v16.1q, v0.1d, v2.1d    // u_00 v_00
        pmull2  v17.1q, v0.2d, v3.2d    // u_01 v_01
        eor     v24.16b, v24.16b, v25.16b // u_00 v_01 + u_01 v_00
        vshr128 v25, v24, 64
        vshl128 v24, v24, 64
        eor     v16.16b, v16.16b, v24.16b // a_0
        eor     v17.16b, v17.16b, v25.16b // a_1

        // Mix the pieces we have so far.
        eor     v20.16b, v20.16b, v16.16b
        eor     v21.16b, v21.16b, v17.16b

        // Finally, work on the high half, c = u_1 v_1.
        pmull   v24.1q, v1.1d, v5.1d    // u_10 v_11
        pmull2  v25.1q, v1.2d, v4.2d    // u_11 v_10
        pmull   v18.1q, v1.1d, v4.1d    // u_10 v_10
        pmull2  v19.1q, v1.2d, v5.2d    // u_11 v_11
        eor     v24.16b, v24.16b, v25.16b // u_10 v_11 + u_11 v_10
        vshr128 v25, v24, 64
        vshl128 v24, v24, 64
        eor     v18.16b, v18.16b, v24.16b // c_0
        eor     v19.16b, v19.16b, v25.16b // c_1

        // Finish mixing the product together.
        eor     v20.16b, v20.16b, v18.16b
        eor     v21.16b, v21.16b, v19.16b
        eor     v17.16b, v17.16b, v20.16b
        eor     v18.16b, v18.16b, v21.16b

        // Now we must reduce.  This is essentially the same as the 192-bit
        // case above, but more complicated because everything is bigger.
        // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
        // v16 =                        // (y_0; y_1)
        // v17 =                        // (y_2; y_3)
        // v18 =                        // (y_4; y_5)
        // v19 =                        // (y_6; y_7)
        ushr    v24.2d, v18.2d, #62     // (y_4; y_5) b_i for t^2
        ushr    v25.2d, v19.2d, #62     // (y_6; y_7) b_i for t^2
        ushr    v26.2d, v18.2d, #59     // (y_4; y_5) b_i for t^5
        ushr    v27.2d, v19.2d, #59     // (y_6; y_7) b_i for t^5
        ushr    v28.2d, v18.2d, #54     // (y_4; y_5) b_i for t^10
        ushr    v29.2d, v19.2d, #54     // (y_6; y_7) b_i for t^10
        eor     v24.16b, v24.16b, v26.16b // mix the contributions together
        eor     v25.16b, v25.16b, v27.16b
        eor     v24.16b, v24.16b, v28.16b
        eor     v25.16b, v25.16b, v29.16b
        vshr128 v26, v25, 64            // slide contribs into position
        ext     v25.16b, v24.16b, v25.16b, #8
        vshl128 v24, v24, 64
        eor     v18.16b, v18.16b, v26.16b
        eor     v17.16b, v17.16b, v25.16b
        eor     v16.16b, v16.16b, v24.16b

        // And then shift the low bits up.
        // v16 =                        // (y'_0; y'_1)
        // v17 =                        // (y'_2; y'_3)
        // v18 =                        // (y'_4; y'_5)
        // v19 =                        // (y'_6; y'_7)
        shl     v24.2d, v18.2d, #2      // (y'_4; y'_5) a_i for t^2
        shl     v25.2d, v19.2d, #2      // (y'_6; y'_7) a_i for t^2
        shl     v26.2d, v18.2d, #5      // (y'_4; y'_5) a_i for t^5
        shl     v27.2d, v19.2d, #5      // (y'_6; y'_7) a_i for t^5
        shl     v28.2d, v18.2d, #10     // (y'_4; y'_5) a_i for t^10
        shl     v29.2d, v19.2d, #10     // (y'_6; y'_7) a_i for t^10
        eor     v18.16b, v18.16b, v24.16b // mix the contributions together
        eor     v19.16b, v19.16b, v25.16b
        eor     v26.16b, v26.16b, v28.16b
        eor     v27.16b, v27.16b, v29.16b
        eor     v18.16b, v18.16b, v26.16b
        eor     v19.16b, v19.16b, v27.16b
        eor     v0.16b, v16.16b, v18.16b
        eor     v1.16b, v17.16b, v19.16b
        .endm

///--------------------------------------------------------------------------
/// Main code.

// There are a number of representations of field elements in this code and
// it can be confusing.
//
//   * The `external format' consists of a sequence of contiguous bytes in
//     memory called a `block'.  The GCM spec explains how to interpret this
//     block as an element of a finite field.  As discussed extensively, this
//     representation is very annoying for a number of reasons.  On the other
//     hand, this code never actually deals with it directly.
//
//   * The `register format' consists of one or more SIMD registers,
//     depending on the block size.  The bits in each byte are reversed,
//     compared to the external format, which makes the polynomials
//     completely vanilla, unlike all of the other GCM implementations.
//
//   * The `table format' is just like the `register format', only the two
//     halves of a 128-bit SIMD register are the same, so we need twice as
//     many registers.
//
//   * The `words' format consists of a sequence of bytes, as in the
//     `external format', but, according to the blockcipher in use, the bytes
//     within each 32-bit word may be reversed (`big-endian') or not
//     (`little-endian').  Accordingly, there are separate entry points for
//     each variant, identified with `b' or `l'.
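//
// (Illustrative example, not part of the original: under GCM's convention
// the first bit of a block -- the most-significant bit of its first byte --
// is the coefficient of the lowest-degree term.  So a leading byte 0x80 in
// external format denotes the element 1; after `rbit' it reads 0x01 in
// register format, and bit significance then increases monotonically.  The
// `b' and `l' entry points differ only in whether a `rev32' is needed to
// undo the blockcipher's word-wise byte order before this bit reversal.)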

FUNC(gcm_mulk_128b_arm64_pmull)
        // On entry, x0 points to a 128-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0]
        ldp     q1, q2, [x1]
        rev32   v0.16b, v0.16b
        vzero
        rbit    v0.16b, v0.16b
        mul128
        rbit    v0.16b, v0.16b
        rev32   v0.16b, v0.16b
        str     q0, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_128l_arm64_pmull)
        // On entry, x0 points to a 128-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0]
        ldp     q1, q2, [x1]
        vzero
        rbit    v0.16b, v0.16b
        mul128
        rbit    v0.16b, v0.16b
        str     q0, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_64b_arm64_pmull)
        // On entry, x0 points to a 64-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     d0, [x0]
        ldr     q1, [x1]
        rev32   v0.8b, v0.8b
        rbit    v0.8b, v0.8b
        mul64
        rbit    x2, x2
        ror     x2, x2, #32
        str     x2, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_64l_arm64_pmull)
        // On entry, x0 points to a 64-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     d0, [x0]
        ldr     q1, [x1]
        rbit    v0.8b, v0.8b
        mul64
        rbit    x2, x2
        rev     x2, x2
        str     x2, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_96b_arm64_pmull)
        // On entry, x0 points to a 96-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     w2, [x0, #8]
        ldr     d0, [x0, #0]
        mov     v0.d[1], x2
        ldp     q1, q2, [x1]
        rev32   v0.16b, v0.16b
        vzero
        rbit    v0.16b, v0.16b
        mul96
        rbit    v0.16b, v0.16b
        rev32   v0.16b, v0.16b
        mov     w2, v0.s[2]
        str     d0, [x0, #0]
        str     w2, [x0, #8]
        ret
ENDFUNC

FUNC(gcm_mulk_96l_arm64_pmull)
        // On entry, x0 points to a 96-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     d0, [x0, #0]
        ldr     w2, [x0, #8]
        mov     v0.d[1], x2
        ldp     q1, q2, [x1]
        rbit    v0.16b, v0.16b
        vzero
        mul96
        rbit    v0.16b, v0.16b
        mov     w2, v0.s[2]
        str     d0, [x0, #0]
        str     w2, [x0, #8]
        ret
ENDFUNC

FUNC(gcm_mulk_192b_arm64_pmull)
        // On entry, x0 points to a 192-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0, #0]
        ldr     d1, [x0, #16]
        ldp     q2, q3, [x1, #0]
        ldr     q4, [x1, #32]
        rev32   v0.16b, v0.16b
        rev32   v1.8b, v1.8b
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        vzero
        mul192
        rev32   v0.16b, v0.16b
        rev32   v1.8b, v1.8b
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        str     q0, [x0, #0]
        str     d1, [x0, #16]
        ret
ENDFUNC

FUNC(gcm_mulk_192l_arm64_pmull)
        // On entry, x0 points to a 192-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0, #0]
        ldr     d1, [x0, #16]
        ldp     q2, q3, [x1, #0]
        ldr     q4, [x1, #32]
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        vzero
        mul192
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        str     q0, [x0, #0]
        str     d1, [x0, #16]
        ret
ENDFUNC

FUNC(gcm_mulk_256b_arm64_pmull)
        // On entry, x0 points to a 256-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldp     q0, q1, [x0]
        ldp     q2, q3, [x1, #0]
        ldp     q4, q5, [x1, #32]
        rev32   v0.16b, v0.16b
        rev32   v1.16b, v1.16b
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        vzero
        mul256
        rev32   v0.16b, v0.16b
        rev32   v1.16b, v1.16b
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        stp     q0, q1, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_256l_arm64_pmull)
        // On entry, x0 points to a 256-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldp     q0, q1, [x0]
        ldp     q2, q3, [x1, #0]
        ldp     q4, q5, [x1, #32]
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        vzero
        mul256
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        stp     q0, q1, [x0]
        ret
ENDFUNC

///----- That's all, folks --------------------------------------------------