Commit be59b2b31d: Import

aes-arm64.S (new file)
@@ -0,0 +1,286 @@
#include "asm-common.h"

        .arch   armv8-a+crypto

        .extern F(abort)
        .extern F(rijndael_rcon)

        .text

///--------------------------------------------------------------------------
/// Main code.

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but the internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.
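///
/// (Illustrative note, not part of the original: `end-swapping' here means
/// reversing the bytes within each 32-bit word, which is what the `rev32
/// vN.16b, ...' instructions below do.  For example, a word stored in
/// memory as the big-endian byte sequence 00 11 22 33 is held in a vector
/// register as 33 22 11 00 while the AES instructions operate on it.)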

        // Useful constants.
        .equ    maxrounds, 16           // maximum number of rounds
        .equ    maxblksz, 32            // maximum block size, in bytes
        .equ    kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer

        // Context structure.
        .equ    nr, 0                   // number of rounds
        .equ    w, nr + 4               // encryption key words
        .equ    wi, w + kbufsz          // decryption key words
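
        // (Illustrative sketch, not part of the original: these offsets
        // assume a C context layout along the lines of
        //
        //      struct rijndael_ctx_assumed {
        //              uint32_t nr;      /* number of rounds */
        //              uint32_t w[136];  /* encryption keys: kbufsz bytes */
        //              uint32_t wi[136]; /* decryption keys */
        //      };
        //
        // where 136 = kbufsz/4 = 32*(16 + 1)/4.  The real structure is
        // defined on the C side of the library; the name here is
        // hypothetical.)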

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm64_crypto)

        // Arguments:
        //      x0 = pointer to context
        //      w1 = block size in 32-bit words
        //      x2 = pointer to key material
        //      x3 = key size in words

        pushreg x29, x30
        mov     x29, sp

        // The initial round key material is taken directly from the input
        // key, so copy it over.  Unfortunately, the key material is not
        // guaranteed to be aligned in any especially useful way.  Assume
        // that alignment traps are not enabled.  (Why would they be?  On
        // A32, alignment traps were part of a transition plan which changed
        // the way unaligned loads and stores behaved, but there's never been
        // any other behaviour on A64.)
        mov     x15, x3
        add     x4, x0, #w
0:      sub     x15, x15, #1
        ldr     w14, [x2], #4
        str     w14, [x4], #4
        cbnz    x15, 0b

        // Find out other useful things and prepare for the main loop.
9:      ldr     w9, [x0, #nr]           // number of rounds
        madd    w2, w1, w9, w1          // total key size in words
        leaext  x5, rijndael_rcon       // round constants
        sub     x6, x2, x3              // minus what we've copied already
        add     x7, x0, #w              // position in previous cycle
        movi    v1.4s, #0               // all-zero register for the key
        mov     x8, #0                  // position in current cycle

        // Main key expansion loop.  Dispatch according to the position in
        // the cycle.
0:      ldr     w15, [x7], #4           // word from previous cycle
        cbz     x8, 1f                  // first word of the cycle?
        cmp     x8, #4                  // fourth word of the cycle?
        b.ne    2f
        cmp     x3, #7                  // seven or eight words of key?
        b.cc    2f

        // Fourth word of the cycle, seven or eight words of key.  We must
        // do the byte substitution.
        dup     v0.4s, w14
        aese    v0.16b, v1.16b          // effectively, just SubBytes
        mov     w14, v0.s[0]
        b       2f

        // First word of the cycle.  Byte substitution, rotation, and round
        // constant.
1:      ldrb    w13, [x5], #1           // next round constant
        dup     v0.4s, w14
        aese    v0.16b, v1.16b          // effectively, just SubBytes
        mov     w14, v0.s[0]
        eor     w14, w13, w14, ror #8

        // Common ending: mix in the word from the previous cycle and store.
2:      eor     w14, w14, w15
        str     w14, [x4], #4

        // Prepare for the next iteration.  If we're done, then stop; if
        // we've finished a cycle then reset the counter.
        add     x8, x8, #1
        sub     x6, x6, #1
        cmp     x8, x3
        cbz     x6, 9f
        cmov.cs x8, xzr
        b       0b
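
        // (Illustrative note, not part of the original: the loop above is
        // the standard Rijndael key-schedule recurrence
        //
        //      w[i] = w[i - k] ^ f(w[i - 1])
        //
        // where k is the key length in words (x3), and f applies SubBytes,
        // RotWord, and a round constant when i % k == 0, SubBytes alone
        // when k > 6 and i % k == 4, and nothing otherwise.  Duplicating
        // the word across all four lanes before `aese' with an all-zero
        // key means the AddRoundKey step does nothing and ShiftRows only
        // permutes equal bytes, leaving just the SubBytes we wanted; the
        // RotWord rotation is folded into the `ror #8' where the round
        // constant is mixed in.)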

        // Next job is to construct the decryption keys.  The keys for the
        // first and last rounds don't need to be mangled, but the remaining
        // ones do -- and they all need to be reordered too.
        //
        // The plan of action, then, is to copy the final encryption round's
        // keys into place first, then to do each of the intermediate rounds
        // in reverse order, and finally do the first round.
        //
        // Do all the heavy lifting with the vector registers.  The order
        // we're doing this in means that it's OK if we read or write too
        // much, and there's easily enough buffer space for the
        // over-enthusiastic reads and writes because the context has space
        // for 32-byte blocks, which is our maximum and an exact fit for two
        // full-width registers.
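        //
        // (Illustrative note, not part of the original: this is the usual
        // `equivalent inverse cipher' construction.  To reuse the
        // encryption-shaped round structure for decryption, each middle
        // round key must be transformed by InvMixColumns -- which is
        // exactly what the `aesimc' instructions below do to the copied
        // keys; the first and last rounds have no MixColumns and so escape
        // the mangling.)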
9:      add     x5, x0, #wi
        add     x4, x0, #w
        add     x4, x4, w2, uxtw #2
        sub     x4, x4, w1, uxtw #2     // last round's keys

        // Copy the last encryption round's keys.
        ld1     {v0.4s, v1.4s}, [x4]
        st1     {v0.4s, v1.4s}, [x5]

        // Update the loop variables and stop if we've finished.
0:      sub     w9, w9, #1
        add     x5, x5, w1, uxtw #2
        sub     x4, x4, w1, uxtw #2
        cbz     w9, 9f

        // Do another middle round's keys...
        ld1     {v0.4s, v1.4s}, [x4]
        aesimc  v0.16b, v0.16b
        aesimc  v1.16b, v1.16b
        st1     {v0.4s, v1.4s}, [x5]
        b       0b

        // Finally do the first encryption round.
9:      ld1     {v0.4s, v1.4s}, [x4]
        st1     {v0.4s, v1.4s}, [x5]

        // If the block size is not exactly four words then we must end-swap
        // everything.  We can use fancy vector toys for this.
        cmp     w1, #4
        b.eq    9f

        // End-swap the encryption keys.
        add     x1, x0, #w
        bl      endswap_block

        // And the decryption keys.
        add     x1, x0, #wi
        bl      endswap_block

        // All done.
9:      popreg  x29, x30
        ret

ENDFUNC

INTFUNC(endswap_block)
        // End-swap w2 words starting at x1.  x1 is clobbered; w2 is not.
        // It's OK to work in 16-byte chunks.

        mov     w3, w2
0:      subs    w3, w3, #4
        ld1     {v0.4s}, [x1]
        rev32   v0.16b, v0.16b
        st1     {v0.4s}, [x1], #16
        b.hi    0b
        ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

        .macro  encdec  op, aes, mc, koff
FUNC(rijndael_\op\()_arm64_crypto)

        // Arguments:
        //      x0 = pointer to context
        //      x1 = pointer to input block
        //      x2 = pointer to output block

        // Set things up ready.
        ldr     w3, [x0, #nr]
        add     x0, x0, #\koff
        ld1     {v0.4s}, [x1]
        rev32   v0.16b, v0.16b

        // Check the number of rounds and dispatch.
        cmp     w3, #14
        b.eq    14f
        cmp     w3, #10
        b.eq    10f
        cmp     w3, #12
        b.eq    12f
        cmp     w3, #13
        b.eq    13f
        cmp     w3, #11
        b.eq    11f
        callext F(abort)
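
        // (Illustrative note, not part of the original: the round counts
        // correspond to key and block sizes.  AES proper has 10, 12, or 14
        // rounds for 128-, 192-, or 256-bit keys; the odd counts arise
        // from the larger Rijndael block sizes, since the number of rounds
        // is determined by the larger of the block and key lengths.
        // Anything else indicates a corrupt context, hence the `abort'.)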

        // Eleven rounds.
11:     ld1     {v16.4s}, [x0], #16
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        b       10f

        // Twelve rounds.
12:     ld1     {v16.4s, v17.4s}, [x0], #32
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        b       10f

        // Thirteen rounds.
13:     ld1     {v16.4s-v18.4s}, [x0], #48
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v18.16b
        \mc     v0.16b, v0.16b
        b       10f

        // Fourteen rounds.  (Drops through to the ten round case because
        // this is the next most common.)
14:     ld1     {v16.4s-v19.4s}, [x0], #64
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v18.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v19.16b
        \mc     v0.16b, v0.16b
        // Drop through...

        // Ten rounds.
10:     ld1     {v16.4s-v19.4s}, [x0], #64
        ld1     {v20.4s-v23.4s}, [x0], #64
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v18.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v19.16b
        \mc     v0.16b, v0.16b

        ld1     {v16.4s-v18.4s}, [x0], #48
        \aes    v0.16b, v20.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v21.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v22.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v23.16b
        \mc     v0.16b, v0.16b

        // Final round has no MixColumns, but is followed by final whitening.
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        eor     v0.16b, v0.16b, v18.16b

        // All done.
        rev32   v0.16b, v0.16b
        st1     {v0.4s}, [x2]
        ret

ENDFUNC
        .endm

        encdec  eblk, aese, aesmc, w
        encdec  dblk, aesd, aesimc, wi

///----- That's all, folks --------------------------------------------------

asm-common.h (new file, 1180 lines)
The diff for this file is not shown because it is too large.

gcm-arm64-pmull.S (new file)
@@ -0,0 +1,631 @@
#include "asm-common.h"

        .arch   armv8-a+crypto

        .text

///--------------------------------------------------------------------------
/// Multiplication macros.

        // The good news is that we have a fancy instruction to do the
        // multiplications.  The bad news is that it's not particularly well-
        // suited to the job.
        //
        // For one thing, it only does a 64-bit multiplication, so in general
        // we'll need to synthesize the full-width multiply by hand.  For
        // another thing, it doesn't help with the reduction, so we have to
        // do that by hand too.  And, finally, GCM has crazy bit ordering,
        // and the instruction does nothing useful for that at all.
        //
        // Focusing on that last problem first: the bits aren't in monotonic
        // significance order unless we permute them.  Fortunately, ARM64 has
        // an instruction which will just permute the bits in each byte for
        // us, so we don't have to worry about this very much.
        //
        // Our main weapons, the `pmull' and `pmull2' instructions, work on
        // 64-bit operands, in half of a vector register, and produce 128-bit
        // results.  But neither of them will multiply the high half of one
        // vector by the low half of a second one, so we have a problem,
        // which we solve by representing one of the operands redundantly:
        // rather than packing the 64-bit pieces together, we duplicate each
        // 64-bit piece across both halves of a register.
        //
        // The commentary for `mul128' is the most detailed.  The other
        // macros assume that you've already read and understood that.
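        //
        // (Illustrative note, not part of the original: the bit-permuting
        // instruction referred to is `rbit', which reverses the bits
        // within each byte of a vector.  Together with the byte order of
        // the loads, this turns GCM's reflected bit convention into an
        // ordinary polynomial whose coefficient significance increases
        // monotonically, so `pmull' computes a plain carry-less product.)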

        .macro  mul128
        // Enter with u and v in v0 and v1/v2 respectively, and 0 in v31;
        // leave with z = u v in v0.  Clobbers v1--v6.

        // First for the double-precision multiplication.  It's tempting to
        // use Karatsuba's identity here, but I suspect that loses more in
        // the shifting, bit-twiddling, and dependency chains that it gains
        // in saving a multiplication which otherwise pipelines well.
        // v0 =                         // (u_0; u_1)
        // v1/v2 =                      // (v_0; v_1)
        pmull2  v3.1q, v0.2d, v1.2d     // u_1 v_0
        pmull   v4.1q, v0.1d, v2.1d     // u_0 v_1
        pmull2  v5.1q, v0.2d, v2.2d     // (t_1; x_3) = u_1 v_1
        pmull   v6.1q, v0.1d, v1.1d     // (x_0; t_0) = u_0 v_0

        // Arrange the pieces to form a double-precision polynomial.
        eor     v3.16b, v3.16b, v4.16b  // (m_0; m_1) = u_0 v_1 + u_1 v_0
        vshr128 v4, v3, 64              // (m_1; 0)
        vshl128 v3, v3, 64              // (0; m_0)
        eor     v1.16b, v5.16b, v4.16b  // (x_2; x_3)
        eor     v0.16b, v6.16b, v3.16b  // (x_0; x_1)

        // And now the only remaining difficulty is that the result needs to
        // be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128
        // = t^7 + t^2 + t + 1 in our field.  So far, we've calculated z_0
        // and z_1 such that z_0 + z_1 R = u v using the identity R = t^128:
        // now we must collapse the two halves of y together using the other
        // identity R = t^7 + t^2 + t + 1.
        //
        // We do this by working on y_2 and y_3 separately, so consider y_i
        // for i = 2 or 3.  Certainly, y_i t^{64i} = y_i R t^{64(i-2)} =
        // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
        // directly without breaking up the 64-bit word structure.  Instead,
        // we start by considering just y_i t^7 t^{64(i-2)}, which again
        // looks tricky.  Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
        // then
        //
        //      y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
        //
        // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
        // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
        // splits are different.  This is lovely, with one small snag: when
        // we do this to y_3, we end up with a contribution back into the
        // t^128 coefficient word.  But notice that only the low seven bits
        // of this word are affected, so there's no knock-on contribution
        // into the t^64 word.  Therefore, if we handle the high bits of each
        // word together, and then the low bits, everything will be fine.
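        //
        // (Illustrative example, not part of the original: take i = 2, so
        // y_2 sits in the t^128 word.  Writing y_2 = a_2 + t^57 b_2, the
        // t^7 term of R contributes y_2 t^7 = a_2 t^7 + b_2 t^64: the
        // `ushr ..., #57' below extracts exactly the b_i pieces that spill
        // into the next word up, while `shl ..., #7' supplies the a_i t^7
        // pieces that stay within the word.  The t and t^2 terms work the
        // same way with shift pairs 63/1 and 62/2, and the unit term needs
        // no shifting at all.)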

        // First, shift the high bits down.
        ushr    v2.2d, v1.2d, #63       // the b_i for t
        ushr    v3.2d, v1.2d, #62       // the b_i for t^2
        ushr    v4.2d, v1.2d, #57       // the b_i for t^7
        eor     v2.16b, v2.16b, v3.16b  // add them all together
        eor     v2.16b, v2.16b, v4.16b
        vshr128 v3, v2, 64
        vshl128 v4, v2, 64
        eor     v1.16b, v1.16b, v3.16b  // contribution into high half
        eor     v0.16b, v0.16b, v4.16b  // and low half

        // And then shift the low bits up.
        shl     v2.2d, v1.2d, #1
        shl     v3.2d, v1.2d, #2
        shl     v4.2d, v1.2d, #7
        eor     v1.16b, v1.16b, v2.16b  // unit and t contribs
        eor     v3.16b, v3.16b, v4.16b  // t^2 and t^7 contribs
        eor     v0.16b, v0.16b, v1.16b  // mix everything together
        eor     v0.16b, v0.16b, v3.16b  // ... and we're done
        .endm

        .macro  mul64
        // Enter with u and v in the low halves of v0 and v1, respectively;
        // leave with z = u v in x2.  Clobbers x2--x4.

        // The multiplication is thankfully easy.
        // v0 =                         // (u; ?)
        // v1 =                         // (v; ?)
        pmull   v0.1q, v0.1d, v1.1d     // u v

        // Now we must reduce.  This is essentially the same as the 128-bit
        // case above, but mostly simpler because everything is smaller.  The
        // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.

        // Before we get stuck in, transfer the product to general-purpose
        // registers.
        mov     x3, v0.d[1]
        mov     x2, v0.d[0]

        // First, shift the high bits down.
        eor     x4, x3, x3, lsr #1      // pre-mix t^3 and t^4
        eor     x3, x3, x3, lsr #63     // mix in t contribution
        eor     x3, x3, x4, lsr #60     // shift and mix in t^3 and t^4

        // And then shift the low bits up.
        eor     x3, x3, x3, lsl #1      // mix unit and t; pre-mix t^3, t^4
        eor     x2, x2, x3              // fold them in
        eor     x2, x2, x3, lsl #3      // and t^3 and t^4
        .endm
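
        // (Illustrative sketch, not part of the original: in portable C
        // the same reduction might read, with `hi' and `lo' hypothetical
        // uint64_t variables holding the two product halves,
        //
        //      t  = hi ^ (hi >> 1);    /* pre-mix t^3 and t^4 */
        //      hi ^= hi >> 63;         /* t contribution */
        //      hi ^= t >> 60;          /* t^3 and t^4 contributions */
        //      hi ^= hi << 1;          /* unit and t; pre-mix t^3, t^4 */
        //      lo ^= hi;               /* fold them in */
        //      lo ^= hi << 3;          /* and t^3 and t^4 */
        //
        // leaving the reduced 64-bit result in lo.)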

        .macro  mul96
        // Enter with u in the least-significant 96 bits of v0, with zero in
        // the upper 32 bits, and with the least-significant 64 bits of v in
        // both halves of v1, and the upper 32 bits of v in the low 32 bits
        // of each half of v2, with zero in the upper 32 bits; and with zero
        // in v31.  Yes, that's a bit hairy.  Leave with the product u v in
        // the low 96 bits of v0, and /junk/ in the high 32 bits.  Clobbers
        // v1--v6.

        // This is an inconvenient size.  There's nothing for it but to do
        // four multiplications, as if for the 128-bit case.  It's possible
        // that there's cruft in the top 32 bits of the input registers, so
        // shift both of them up by four bytes before we start.  This will
        // mean that the high 64 bits of the result (from GCM's viewpoint)
        // will be zero.
        // v0 =                         // (u_0 + u_1 t^32; u_2)
        // v1 =                         // (v_0 + v_1 t^32; v_0 + v_1 t^32)
        // v2 =                         // (v_2; v_2)
        pmull2  v5.1q, v0.2d, v1.2d     // u_2 (v_0 + v_1 t^32) t^32 = e_0
        pmull   v4.1q, v0.1d, v2.1d     // v_2 (u_0 + u_1 t^32) t^32 = e_1
        pmull2  v6.1q, v0.2d, v2.2d     // u_2 v_2 = d = (d; 0)
        pmull   v3.1q, v0.1d, v1.1d     // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
                                        //   + u_1 v_1 t^64 = f

        // Extract the high and low halves of the 192-bit result.  The answer
        // we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The low 96
        // bits of the answer will end up in v0, with junk in the top 32
        // bits; the high 96 bits will end up in v1, which must have zero in
        // its top 32 bits.
        //
        // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
        // in the low 96 bits of a SIMD register, with junk in the top 32
        // bits; and top(x) is the high 96 bits, also arranged in the low 96
        // bits of a register, with /zero/ in the top 32 bits.
        eor     v4.16b, v4.16b, v5.16b  // e_0 + e_1 = e
        vshl128 v6, v6, 32              // top(d t^128)
        vshr128 v5, v4, 32              // top(e t^64)
        vshl128 v4, v4, 64              // bot(e t^64)
        vshr128 v1, v3, 96              // top(f)
        eor     v6.16b, v6.16b, v5.16b  // top(d t^128 + e t^64)
        eor     v0.16b, v3.16b, v4.16b  // bot([d t^128] + e t^64 + f)
        eor     v1.16b, v1.16b, v6.16b  // top(e t^64 + d t^128 + f)

        // Finally, the reduction.  This is essentially the same as the
        // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
        // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
        // trouble for the general approach.  Unfortunately, we have to do
        // this in 32-bit pieces rather than 64.

        // First, shift the high bits down.
        ushr    v2.4s, v1.4s, #26       // the b_i for t^6
        ushr    v3.4s, v1.4s, #23       // the b_i for t^9
        ushr    v4.4s, v1.4s, #22       // the b_i for t^10
        eor     v2.16b, v2.16b, v3.16b  // add them all together
        eor     v2.16b, v2.16b, v4.16b
        vshr128 v3, v2, 64              // contribution for high half
        vshl128 v2, v2, 32              // contribution for low half
        eor     v1.16b, v1.16b, v3.16b  // apply to high half
        eor     v0.16b, v0.16b, v2.16b  // and low half

        // And then shift the low bits up.
        shl     v2.4s, v1.4s, #6
        shl     v3.4s, v1.4s, #9
        shl     v4.4s, v1.4s, #10
        eor     v1.16b, v1.16b, v2.16b  // unit and t^6 contribs
        eor     v3.16b, v3.16b, v4.16b  // t^9 and t^10 contribs
        eor     v0.16b, v0.16b, v1.16b  // mix everything together
        eor     v0.16b, v0.16b, v3.16b  // ... and we're done
        .endm

        .macro  mul192
        // Enter with u in v0 and the less-significant half of v1, with v
        // duplicated across both halves of v2/v3/v4, and with zero in v31.
        // Leave with the product u v in v0 and the bottom half of v1.
        // Clobbers v16--v25.

        // Start multiplying and accumulating pieces of product.
        // v0 =                         // (u_0; u_1)
        // v1 =                         // (u_2; ?)
        // v2 =                         // (v_0; v_0)
        // v3 =                         // (v_1; v_1)
        // v4 =                         // (v_2; v_2)
        pmull   v16.1q, v0.1d, v2.1d    // a = u_0 v_0

        pmull   v19.1q, v0.1d, v3.1d    // u_0 v_1
        pmull2  v21.1q, v0.2d, v2.2d    // u_1 v_0

        pmull   v17.1q, v0.1d, v4.1d    // u_0 v_2
        pmull2  v22.1q, v0.2d, v3.2d    // u_1 v_1
        pmull   v23.1q, v1.1d, v2.1d    // u_2 v_0
        eor     v19.16b, v19.16b, v21.16b // b = u_0 v_1 + u_1 v_0

        pmull2  v20.1q, v0.2d, v4.2d    // u_1 v_2
        pmull   v24.1q, v1.1d, v3.1d    // u_2 v_1
        eor     v17.16b, v17.16b, v22.16b // u_0 v_2 + u_1 v_1

        pmull   v18.1q, v1.1d, v4.1d    // e = u_2 v_2
        eor     v17.16b, v17.16b, v23.16b // c = u_0 v_2 + u_1 v_1 + u_2 v_0
        eor     v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1

        // Piece the product together.
        // v16 =                        // (a_0; a_1)
        // v19 =                        // (b_0; b_1)
        // v17 =                        // (c_0; c_1)
        // v20 =                        // (d_0; d_1)
        // v18 =                        // (e_0; e_1)
        vshl128 v21, v19, 64            // (0; b_0)
        ext     v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
        vshr128 v23, v20, 64            // (d_1; 0)
        eor     v16.16b, v16.16b, v21.16b // (x_0; x_1)
        eor     v17.16b, v17.16b, v22.16b // (x_2; x_3)
        eor     v18.16b, v18.16b, v23.16b // (x_4; x_5)

        // Next, the reduction.  Our polynomial this time is p(t) = t^192 +
        // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
        // 128-bit case.  I don't know why.

        // First, shift the high bits down.
        // v16 =                        // (y_0; y_1)
        // v17 =                        // (y_2; y_3)
        // v18 =                        // (y_4; y_5)
        mov     v19.d[0], v17.d[1]      // (y_3; ?)

        ushr    v23.2d, v18.2d, #63     // hi b_i for t
        ushr    d20, d19, #63           // lo b_i for t
        ushr    v24.2d, v18.2d, #62     // hi b_i for t^2
        ushr    d21, d19, #62           // lo b_i for t^2
        ushr    v25.2d, v18.2d, #57     // hi b_i for t^7
        ushr    d22, d19, #57           // lo b_i for t^7
        eor     v23.16b, v23.16b, v24.16b // mix them all together
        eor     v20.8b, v20.8b, v21.8b
        eor     v23.16b, v23.16b, v25.16b
        eor     v20.8b, v20.8b, v22.8b

        // Permute the high pieces while we fold in the b_i.
        eor     v17.16b, v17.16b, v23.16b
        vshl128 v20, v20, 64
        mov     v19.d[0], v18.d[1]      // (y_5; ?)
        ext     v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
        eor     v16.16b, v16.16b, v20.16b

        // And finally shift the low bits up.
        // v16 =                        // (y'_0; y'_1)
        // v17 =                        // (y'_2; ?)
        // v18 =                        // (y'_3; y'_4)
        // v19 =                        // (y'_5; ?)
        shl     v20.2d, v18.2d, #1
        shl     d23, d19, #1
        shl     v21.2d, v18.2d, #2
        shl     d24, d19, #2
        shl     v22.2d, v18.2d, #7
        shl     d25, d19, #7
        eor     v18.16b, v18.16b, v20.16b // unit and t contribs
        eor     v19.8b, v19.8b, v23.8b
        eor     v21.16b, v21.16b, v22.16b // t^2 and t^7 contribs
        eor     v24.8b, v24.8b, v25.8b
        eor     v18.16b, v18.16b, v21.16b // all contribs
        eor     v19.8b, v19.8b, v24.8b
        eor     v0.16b, v16.16b, v18.16b // mix them into the low half
        eor     v1.8b, v17.8b, v19.8b
        .endm

        .macro  mul256
        // Enter with u in v0/v1, with v duplicated across both halves of
        // v2--v5, and with zero in v31.  Leave with the product u v in
        // v0/v1.  Clobbers ???.

        // Now it's starting to look worthwhile to do Karatsuba.  Suppose
        // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
        //
        //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
        //
        // Name the coefficients of B^i as a, b, and c, respectively, and
        // let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
        //
        //      q = r s = (u_0 + u_1) (v_0 + v_1)
        //          = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
        //          = a + c + b
        //
        // The first two terms we've already calculated; the last is the
        // remaining one we want.  We'll set B = t^128.  We know how to do
        // 128-bit multiplications already, and Karatsuba is too annoying
        // there, so there'll be 12 multiplications altogether, rather than
        // the 16 we'd have if we did this the naïve way.
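        //
        // (Illustrative check, not part of the original: schoolbook
        // multiplication would need four 128-bit products, each costing
        // four `pmull's, for 16 in all; Karatsuba computes only a, c, and
        // q, and recovers b = q + a + c -- addition and subtraction
        // coincide in characteristic 2 -- so three 128-bit products
        // suffice, i.e., 12 `pmull's.)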
        // v0 =                         // u_0 = (u_00; u_01)
        // v1 =                         // u_1 = (u_10; u_11)
        // v2 =                         // (v_00; v_00)
        // v3 =                         // (v_01; v_01)
        // v4 =                         // (v_10; v_10)
        // v5 =                         // (v_11; v_11)

        eor     v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11)
        eor     v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10
        eor     v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11

        // Start by building the cross product, q = u_* v_*.
        pmull   v24.1q, v28.1d, v30.1d  // u_*0 v_*1
        pmull2  v25.1q, v28.2d, v29.2d  // u_*1 v_*0
        pmull   v20.1q, v28.1d, v29.1d  // u_*0 v_*0
        pmull2  v21.1q, v28.2d, v30.2d  // u_*1 v_*1
        eor     v24.16b, v24.16b, v25.16b // u_*0 v_*1 + u_*1 v_*0
        vshr128 v25, v24, 64
        vshl128 v24, v24, 64
        eor     v20.16b, v20.16b, v24.16b // q_0
        eor     v21.16b, v21.16b, v25.16b // q_1

        // Next, work on the low half, a = u_0 v_0.
        pmull   v24.1q, v0.1d, v3.1d    // u_00 v_01
        pmull2  v25.1q, v0.2d, v2.2d    // u_01 v_00
        pmull   v16.1q, v0.1d, v2.1d    // u_00 v_00
        pmull2  v17.1q, v0.2d, v3.2d    // u_01 v_01
        eor     v24.16b, v24.16b, v25.16b // u_00 v_01 + u_01 v_00
        vshr128 v25, v24, 64
        vshl128 v24, v24, 64
        eor     v16.16b, v16.16b, v24.16b // a_0
        eor     v17.16b, v17.16b, v25.16b // a_1

        // Mix the pieces we have so far.
        eor     v20.16b, v20.16b, v16.16b
        eor     v21.16b, v21.16b, v17.16b

        // Finally, work on the high half, c = u_1 v_1.
        pmull   v24.1q, v1.1d, v5.1d    // u_10 v_11
        pmull2  v25.1q, v1.2d, v4.2d    // u_11 v_10
        pmull   v18.1q, v1.1d, v4.1d    // u_10 v_10
        pmull2  v19.1q, v1.2d, v5.2d    // u_11 v_11
        eor     v24.16b, v24.16b, v25.16b // u_10 v_11 + u_11 v_10
        vshr128 v25, v24, 64
        vshl128 v24, v24, 64
        eor     v18.16b, v18.16b, v24.16b // c_0
        eor     v19.16b, v19.16b, v25.16b // c_1

        // Finish mixing the product together.
        eor     v20.16b, v20.16b, v18.16b
        eor     v21.16b, v21.16b, v19.16b
        eor     v17.16b, v17.16b, v20.16b
        eor     v18.16b, v18.16b, v21.16b

        // Now we must reduce.  This is essentially the same as the 192-bit
        // case above, but more complicated because everything is bigger.
        // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
        // v16 =                        // (y_0; y_1)
        // v17 =                        // (y_2; y_3)
        // v18 =                        // (y_4; y_5)
        // v19 =                        // (y_6; y_7)
        ushr    v24.2d, v18.2d, #62     // (y_4; y_5) b_i for t^2
        ushr    v25.2d, v19.2d, #62     // (y_6; y_7) b_i for t^2
        ushr    v26.2d, v18.2d, #59     // (y_4; y_5) b_i for t^5
        ushr    v27.2d, v19.2d, #59     // (y_6; y_7) b_i for t^5
        ushr    v28.2d, v18.2d, #54     // (y_4; y_5) b_i for t^10
        ushr    v29.2d, v19.2d, #54     // (y_6; y_7) b_i for t^10
        eor     v24.16b, v24.16b, v26.16b // mix the contributions together
        eor     v25.16b, v25.16b, v27.16b
        eor     v24.16b, v24.16b, v28.16b
        eor     v25.16b, v25.16b, v29.16b
        vshr128 v26, v25, 64            // slide contribs into position
        ext     v25.16b, v24.16b, v25.16b, #8
        vshl128 v24, v24, 64
        eor     v18.16b, v18.16b, v26.16b
        eor     v17.16b, v17.16b, v25.16b
        eor     v16.16b, v16.16b, v24.16b

        // And then shift the low bits up.
        // v16 =                        // (y'_0; y'_1)
        // v17 =                        // (y'_2; y'_3)
        // v18 =                        // (y'_4; y'_5)
        // v19 =                        // (y'_6; y'_7)
        shl     v24.2d, v18.2d, #2      // (y'_4; y'_5) a_i for t^2
        shl     v25.2d, v19.2d, #2      // (y'_6; y'_7) a_i for t^2
        shl     v26.2d, v18.2d, #5      // (y'_4; y'_5) a_i for t^5
        shl     v27.2d, v19.2d, #5      // (y'_6; y'_7) a_i for t^5
        shl     v28.2d, v18.2d, #10     // (y'_4; y'_5) a_i for t^10
        shl     v29.2d, v19.2d, #10     // (y'_6; y'_7) a_i for t^10
        eor     v18.16b, v18.16b, v24.16b // mix the contributions together
        eor     v19.16b, v19.16b, v25.16b
        eor     v26.16b, v26.16b, v28.16b
        eor     v27.16b, v27.16b, v29.16b
        eor     v18.16b, v18.16b, v26.16b
        eor     v19.16b, v19.16b, v27.16b
        eor     v0.16b, v16.16b, v18.16b
        eor     v1.16b, v17.16b, v19.16b
        .endm

///--------------------------------------------------------------------------
/// Main code.

// There are a number of representations of field elements in this code and
// it can be confusing.
//
//   * The `external format' consists of a sequence of contiguous bytes in
//     memory called a `block'.  The GCM spec explains how to interpret this
//     block as an element of a finite field.  As discussed extensively, this
//     representation is very annoying for a number of reasons.  On the other
//     hand, this code never actually deals with it directly.
//
//   * The `register format' consists of one or more SIMD registers,
//     depending on the block size.  The bits in each byte are reversed,
//     compared to the external format, which makes the polynomials
//     completely vanilla, unlike all of the other GCM implementations.
//
//   * The `table format' is just like the `register format', only the two
//     halves of a 128-bit SIMD register are the same, so we need twice as
//     many registers.
//
//   * The `words' format consists of a sequence of bytes, as in the
//     `external format', but, according to the blockcipher in use, the bytes
//     within each 32-bit word may be reversed (`big-endian') or not
//     (`little-endian').  Accordingly, there are separate entry points for
//     each variant, identified with `b' or `l'.
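//
// (Illustrative example, not part of the original: under GCM's convention
// the first bit of a block -- the most-significant bit of its first byte --
// is the coefficient of the lowest-degree term.  So a leading byte 0x80 in
// external format denotes the element 1; after `rbit' it reads 0x01 in
// register format, and bit significance then increases monotonically.  The
// `b' and `l' entry points differ only in whether a `rev32' is needed to
// undo the blockcipher's word-wise byte order before this bit reversal.)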

FUNC(gcm_mulk_128b_arm64_pmull)
        // On entry, x0 points to a 128-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0]
        ldp     q1, q2, [x1]
        rev32   v0.16b, v0.16b
        vzero
        rbit    v0.16b, v0.16b
        mul128
        rbit    v0.16b, v0.16b
        rev32   v0.16b, v0.16b
        str     q0, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_128l_arm64_pmull)
        // On entry, x0 points to a 128-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0]
        ldp     q1, q2, [x1]
        vzero
        rbit    v0.16b, v0.16b
        mul128
        rbit    v0.16b, v0.16b
        str     q0, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_64b_arm64_pmull)
        // On entry, x0 points to a 64-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     d0, [x0]
        ldr     q1, [x1]
        rev32   v0.8b, v0.8b
        rbit    v0.8b, v0.8b
        mul64
        rbit    x2, x2
        ror     x2, x2, #32
        str     x2, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_64l_arm64_pmull)
        // On entry, x0 points to a 64-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     d0, [x0]
        ldr     q1, [x1]
        rbit    v0.8b, v0.8b
        mul64
        rbit    x2, x2
        rev     x2, x2
        str     x2, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_96b_arm64_pmull)
        // On entry, x0 points to a 96-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     w2, [x0, #8]
        ldr     d0, [x0, #0]
        mov     v0.d[1], x2
        ldp     q1, q2, [x1]
        rev32   v0.16b, v0.16b
        vzero
        rbit    v0.16b, v0.16b
        mul96
        rbit    v0.16b, v0.16b
        rev32   v0.16b, v0.16b
        mov     w2, v0.s[2]
        str     d0, [x0, #0]
        str     w2, [x0, #8]
        ret
ENDFUNC

FUNC(gcm_mulk_96l_arm64_pmull)
        // On entry, x0 points to a 96-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     d0, [x0, #0]
        ldr     w2, [x0, #8]
        mov     v0.d[1], x2
        ldp     q1, q2, [x1]
        rbit    v0.16b, v0.16b
        vzero
        mul96
        rbit    v0.16b, v0.16b
        mov     w2, v0.s[2]
        str     d0, [x0, #0]
        str     w2, [x0, #8]
        ret
ENDFUNC

FUNC(gcm_mulk_192b_arm64_pmull)
        // On entry, x0 points to a 192-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0, #0]
        ldr     d1, [x0, #16]
        ldp     q2, q3, [x1, #0]
        ldr     q4, [x1, #32]
        rev32   v0.16b, v0.16b
        rev32   v1.8b, v1.8b
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        vzero
        mul192
        rev32   v0.16b, v0.16b
        rev32   v1.8b, v1.8b
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        str     q0, [x0, #0]
        str     d1, [x0, #16]
        ret
ENDFUNC

FUNC(gcm_mulk_192l_arm64_pmull)
        // On entry, x0 points to a 192-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0, #0]
        ldr     d1, [x0, #16]
        ldp     q2, q3, [x1, #0]
        ldr     q4, [x1, #32]
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        vzero
        mul192
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        str     q0, [x0, #0]
        str     d1, [x0, #16]
        ret
ENDFUNC

FUNC(gcm_mulk_256b_arm64_pmull)
        // On entry, x0 points to a 256-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldp     q0, q1, [x0]
        ldp     q2, q3, [x1, #0]
        ldp     q4, q5, [x1, #32]
        rev32   v0.16b, v0.16b
        rev32   v1.16b, v1.16b
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        vzero
        mul256
        rev32   v0.16b, v0.16b
        rev32   v1.16b, v1.16b
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        stp     q0, q1, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_256l_arm64_pmull)
        // On entry, x0 points to a 256-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldp     q0, q1, [x0]
        ldp     q2, q3, [x1, #0]
        ldp     q4, q5, [x1, #32]
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        vzero
        mul256
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        stp     q0, q1, [x0]
        ret
ENDFUNC

///----- That's all, folks --------------------------------------------------