commit be59b2b31d5ad1f4c72dc351c59b0331caf3897d Author: Kris Kwiatkowski Date: Thu Nov 30 09:37:13 2023 +0000 Import diff --git a/aes-arm64.S b/aes-arm64.S new file mode 100644 index 0000000..91e3867 --- /dev/null +++ b/aes-arm64.S @@ -0,0 +1,286 @@ +#include "asm-common.h" + + .arch armv8-a+crypto + + .extern F(abort) + .extern F(rijndael_rcon) + + .text + +///-------------------------------------------------------------------------- +/// Main code. + +/// The ARM crypto extension implements a little-endian version of AES +/// (though the manual doesn't actually spell this out and you have to +/// experiment)a, note that internal interface presents as big-endian so +/// as to work better with things like GCM. We therefore maintain the round +/// keys in little-endian form, and have to end-swap blocks in and out. +/// +/// For added amusement, the crypto extension doesn't implement the larger- +/// block versions of Rijndael, so we have to end-swap the keys if we're +/// preparing for one of those. + + // Useful constants. + .equ maxrounds, 16 // maximum number of rounds + .equ maxblksz, 32 // maximum block size, in bytes + .equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer + + // Context structure. + .equ nr, 0 // number of rounds + .equ w, nr + 4 // encryption key words + .equ wi, w + kbufsz // decryption key words + +///-------------------------------------------------------------------------- +/// Key setup. + +FUNC(rijndael_setup_arm64_crypto) + + // Arguments: + // x0 = pointer to context + // w1 = block size in 32-bit words + // x2 = pointer to key material + // x3 = key size in words + + pushreg x29, x30 + mov x29, sp + + // The initial round key material is taken directly from the input + // key, so copy it over. Unfortunately, the key material is not + // guaranteed to be aligned in any especially useful way. Assume + // that alignment traps are not enabled. (Why would they be? On + // A32, alignment traps were part of a transition plan which changed + // the way unaligned loads and stores behaved, but there's never been + // any other behaviour on A64.) + mov x15, x3 + add x4, x0, #w +0: sub x15, x15, #1 + ldr w14, [x2], #4 + str w14, [x4], #4 + cbnz x15, 0b + + // Find out other useful things and prepare for the main loop. +9: ldr w9, [x0, #nr] // number of rounds + madd w2, w1, w9, w1 // total key size in words + leaext x5, rijndael_rcon // round constants + sub x6, x2, x3 // minus what we've copied already + add x7, x0, #w // position in previous cycle + movi v1.4s, #0 // all-zero register for the key + mov x8, #0 // position in current cycle + + // Main key expansion loop. Dispatch according to the position in + // the cycle. +0: ldr w15, [x7], #4 // word from previous cycle + cbz x8, 1f // first word of the cycle? + cmp x8, #4 // fourth word of the cycle? + b.ne 2f + cmp x3, #7 // seven or eight words of key? + b.cc 2f + + // Fourth word of the cycle, seven or eight words of key. We must do + // the byte substitution. + dup v0.4s, w14 + aese v0.16b, v1.16b // effectively, just SubBytes + mov w14, v0.s[0] + b 2f + + // First word of the cycle. Byte substitution, rotation, and round + // constant. +1: ldrb w13, [x5], #1 // next round constant + dup v0.4s, w14 + aese v0.16b, v1.16b // effectively, just SubBytes + mov w14, v0.s[0] + eor w14, w13, w14, ror #8 + + // Common ending: mix in the word from the previous cycle and store. +2: eor w14, w14, w15 + str w14, [x4], #4 + + // Prepare for the next iteration. 
If we're done, then stop; if + // we've finished a cycle then reset the counter. + add x8, x8, #1 + sub x6, x6, #1 + cmp x8, x3 + cbz x6, 9f + cmov.cs x8, xzr + b 0b + + // Next job is to construct the decryption keys. The keys for the + // first and last rounds don't need to be mangled, but the remaining + // ones do -- and they all need to be reordered too. + // + // The plan of action, then, is to copy the final encryption round's + // keys into place first, then to do each of the intermediate rounds + // in reverse order, and finally do the first round. + // + // Do all the heavy lifting with the vector registers. The order + // we're doing this in means that it's OK if we read or write too + // much, and there's easily enough buffer space for the + // over-enthusiastic reads and writes because the context has space + // for 32-byte blocks, which is our maximum and an exact fit for two + // full-width registers. +9: add x5, x0, #wi + add x4, x0, #w + add x4, x4, w2, uxtw #2 + sub x4, x4, w1, uxtw #2 // last round's keys + + // Copy the last encryption round's keys. + ld1 {v0.4s, v1.4s}, [x4] + st1 {v0.4s, v1.4s}, [x5] + + // Update the loop variables and stop if we've finished. +0: sub w9, w9, #1 + add x5, x5, w1, uxtw #2 + sub x4, x4, w1, uxtw #2 + cbz w9, 9f + + // Do another middle round's keys... + ld1 {v0.4s, v1.4s}, [x4] + aesimc v0.16b, v0.16b + aesimc v1.16b, v1.16b + st1 {v0.4s, v1.4s}, [x5] + b 0b + + // Finally do the first encryption round. +9: ld1 {v0.4s, v1.4s}, [x4] + st1 {v0.4s, v1.4s}, [x5] + + // If the block size is not exactly four words then we must end-swap + // everything. We can use fancy vector toys for this. + cmp w1, #4 + b.eq 9f + + // End-swap the encryption keys. + add x1, x0, #w + bl endswap_block + + // And the decryption keys + add x1, x0, #wi + bl endswap_block + + // All done. +9: popreg x29, x30 + ret + +ENDFUNC + +INTFUNC(endswap_block) + // End-swap w2 words starting at x1. x1 is clobbered; w2 is not. + // It's OK to work in 16-byte chunks. + + mov w3, w2 +0: subs w3, w3, #4 + ld1 {v0.4s}, [x1] + rev32 v0.16b, v0.16b + st1 {v0.4s}, [x1], #16 + b.hi 0b + ret + +ENDFUNC + +///-------------------------------------------------------------------------- +/// Encrypting and decrypting blocks. + +.macro encdec op, aes, mc, koff + FUNC(rijndael_\op\()_arm64_crypto) + + // Arguments: + // x0 = pointer to context + // x1 = pointer to input block + // x2 = pointer to output block + + // Set things up ready. + ldr w3, [x0, #nr] + add x0, x0, #\koff + ld1 {v0.4s}, [x1] + rev32 v0.16b, v0.16b + + // Check the number of rounds and dispatch. + cmp w3, #14 + b.eq 14f + cmp w3, #10 + b.eq 10f + cmp w3, #12 + b.eq 12f + cmp w3, #13 + b.eq 13f + cmp w3, #11 + b.eq 11f + callext F(abort) + + // Eleven rounds. +11: ld1 {v16.4s}, [x0], #16 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + b 10f + + // Twelve rounds. +12: ld1 {v16.4s, v17.4s}, [x0], #32 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + b 10f + + // Thirteen rounds. +13: ld1 {v16.4s-v18.4s}, [x0], #48 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + \aes v0.16b, v18.16b + \mc v0.16b, v0.16b + b 10f + + // Fourteen rounds. (Drops through to the ten round case because + // this is the next most common.) +14: ld1 {v16.4s-v19.4s}, [x0], #64 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + \aes v0.16b, v18.16b + \mc v0.16b, v0.16b + \aes v0.16b, v19.16b + \mc v0.16b, v0.16b + // Drop through... 
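+	// For reference, the aese/aesmc (or aesd/aesimc) pattern used by
+	// this macro corresponds to the following C intrinsics -- an
+	// illustrative sketch only; `blk', `rk' and `nr' are made-up names:
+	//
+	//	#include <arm_neon.h>
+	//	uint8x16_t blk = ...;			// the input block
+	//	for (unsigned i = 0; i < nr - 1; i++) {
+	//		blk = vaeseq_u8(blk, rk[i]);	// AddRoundKey, SubBytes, ShiftRows
+	//		blk = vaesmcq_u8(blk);		// MixColumns
+	//	}
+	//	blk = vaeseq_u8(blk, rk[nr - 1]);	// last round: no MixColumns
+	//	blk = veorq_u8(blk, rk[nr]);		// final whitening key
+	//
+	// Decryption has the same shape with vaesdq_u8/vaesimcq_u8 and the
+	// inverse-mixed keys prepared by the setup routine above.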
+ + // Ten rounds. +10: ld1 {v16.4s-v19.4s}, [x0], #64 + ld1 {v20.4s-v23.4s}, [x0], #64 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + \aes v0.16b, v18.16b + \mc v0.16b, v0.16b + \aes v0.16b, v19.16b + \mc v0.16b, v0.16b + + ld1 {v16.4s-v18.4s}, [x0], #48 + \aes v0.16b, v20.16b + \mc v0.16b, v0.16b + \aes v0.16b, v21.16b + \mc v0.16b, v0.16b + \aes v0.16b, v22.16b + \mc v0.16b, v0.16b + \aes v0.16b, v23.16b + \mc v0.16b, v0.16b + + // Final round has no MixColumns, but is followed by final whitening. + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + eor v0.16b, v0.16b, v18.16b + + // All done. + rev32 v0.16b, v0.16b + st1 {v0.4s}, [x2] + ret + + ENDFUNC +.endm + + encdec eblk, aese, aesmc, w + encdec dblk, aesd, aesimc, wi + +///----- That's all, folks -------------------------------------------------- diff --git a/asm-common.h b/asm-common.h new file mode 100644 index 0000000..50a3004 --- /dev/null +++ b/asm-common.h @@ -0,0 +1,1180 @@ +///-------------------------------------------------------------------------- +/// General definitions. + +// Preprocessor hacks. +#define STRINGY(x) _STRINGY(x, y) +#define _STRINGY(x) #x +#define GLUE(x, y) _GLUE(x, y) +#define _GLUE(x, y) x##y +#define _EMPTY + +// Some useful variables. + .L$_subsec = 0 + +// Literal pools done the hard way. +#define _LIT .text .L$_subsec + 1 +#define _ENDLIT .text .L$_subsec +#define _LTORG .L$_subsec = .L$_subsec + 2; .text .L$_subsec + +// ELF section types. +#if __ELF__ +# if CPUFAM_ARMEL +# define _SECTTY(ty) %ty +# else +# define _SECTTY(ty) @ty +# endif +#endif + +// Section selection. +#define TEXT .text .L$_subsec +#if ABI_WIN +# define RODATA .section .rdata, "dr" +#elif __ELF__ +# define RODATA .section .rodata, "a", _SECTTY(progbits) +#else +# define RODATA TEXT +#endif +#define DATA .data + +// Announcing an internal function. +#define INTFUNC(name) \ + TYPE_FUNC(name); \ + .macro ENDFUNC; _ENDFUNC(name); .endm; \ + .L$_prologue_p = 0; .L$_frameptr_p = 0; \ + FUNC_PREHOOK(name); \ +name: \ + FUNC_POSTHOOK(name) + +// Announcing an external function. +#define FUNC(name) \ + .globl F(name); \ +INTFUNC(F(name)) + +// Marking the end of a function. +#define _ENDFUNC(name) \ + .if ~ .L$_prologue_p; .error "Missing `endprologue'"; .endif; \ + .if .L$_frameptr_p; .purgem dropfp; .endif; \ + .purgem ENDFUNC; \ + SIZE_OBJ(name); \ + ENDFUNC_HOOK(name); \ + _LTORG + +// Make a helper function, if necessary. +#define AUXFN(name) \ + .ifndef .L$_auxfn_def.name; \ + .text 7128; \ + .macro _ENDAUXFN; _ENDAUXFN_TAIL(name); .endm; \ + FUNC_PREHOOK(name); \ +name: +#define _ENDAUXFN_TAIL(name) \ + .purgem _ENDAUXFN; \ + .text .L$_subsec; \ + .L$_auxfn_def.name = 1 +#define ENDAUXFN _ENDAUXFN; .endif + +///-------------------------------------------------------------------------- +/// ELF-specific hacking. + +#if __ELF__ + +#if __PIC__ || __PIE__ +# define WANT_PIC 1 +#endif + +#define TYPE_FUNC(name) .type name, STT_FUNC + +#define SIZE_OBJ(name) .size name, . - name + +#endif + +///-------------------------------------------------------------------------- +/// Windows-specific hacking. + +#if ABI_WIN + +#if CPUFAM_X86 +# define F(name) _##name +#endif + +#endif + +///-------------------------------------------------------------------------- +/// x86- and amd64-specific hacking. +/// +/// It's (slightly) easier to deal with both of these in one go. + +#if CPUFAM_X86 || CPUFAM_AMD64 + +// Word size. 
+#if CPUFAM_X86 +# define WORDSZ 4 +#endif +#if CPUFAM_AMD64 +# define WORDSZ 8 +#endif + +// Set the function hooks. +#define FUNC_PREHOOK(_) .balign 16 + +// On Windows, arrange to install stack-unwinding data. +#if CPUFAM_AMD64 && ABI_WIN +# define FUNC_POSTHOOK(name) .seh_proc name +# define ENDFUNC_HOOK(_) .seh_endproc +// Procedures are expected to invoke `.seh_setframe' if necessary, and +// `.seh_pushreg' and friends, and `.seh_endprologue'. +#endif + +#if __ELF__ +# define FUNC_POSTHOOK(_) .cfi_startproc +# define ENDFUNC_HOOK(_) .cfi_endproc +#endif + +// Don't use the wretched AT&T syntax. It's festooned with pointless +// punctuation, and all of the data movement is backwards. Ugh! + .intel_syntax noprefix + +// Call external subroutine at ADDR, possibly via PLT. +.macro callext addr +#if WANT_PIC + call \addr@PLT +#else + call \addr +#endif +.endm + +// Do I need to arrange a spare GOT register? +#if WANT_PIC && CPUFAM_X86 +# define NEED_GOT 1 +#endif +#define GOTREG ebx // Not needed in AMD64 so don't care. + +// Maybe load GOT address into GOT. +.macro ldgot got=GOTREG +#if WANT_PIC && CPUFAM_X86 + AUXFN(_ldgot.\got) + mov \got, [esp] + ret + ENDAUXFN + call _ldgot.\got + add \got, offset _GLOBAL_OFFSET_TABLE_ +#endif +.endm + +// Load address of external symbol ADDR into REG, maybe using GOT. +.macro leaext reg, addr, got=GOTREG +#if WANT_PIC +# if CPUFAM_X86 + mov \reg, [\got + \addr@GOT] +# endif +# if CPUFAM_AMD64 + mov \reg, \addr@GOTPCREL[rip] +# endif +#else +# if CPUFAM_X86 + mov \reg, offset \addr +# endif +# if CPUFAM_AMD64 + lea \reg, \addr[rip] +# endif +#endif +.endm + +// Address expression (possibly using a base register, and a displacement) +// referring to ADDR, which is within our module, maybe using GOT. +#define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy) +#define INTADDR__0(addr, got, ...) INTADDR__1(addr, got) +#if CPUFAM_AMD64 +# define INTADDR__1(addr, got) addr + rip +#elif WANT_PIC +# define INTADDR__1(addr, got) got + addr@GOTOFF +#else +# define INTADDR__1(addr, got) addr +#endif + +// Permutations for SIMD instructions. SHUF(A, B, C, D) is an immediate, +// suitable for use in `pshufd' or `shufpd', which copies element A +// (0 <= A < 4) of the source to element 0 of the destination, element B to +// element 1, element C to element 2, and element D to element 3. +#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d)) + +// Map register names to their individual pieces. + +// Apply decoration decor to (internal) register name reg of type ty. +// +// See `R_...' for internal register names. Decorations are as follows. +// +// b low byte (e.g., `al', `r8b') +// h high byte (e.g., `ah') +// w word (e.g., `ax', `r8w') +// d doubleword (e.g., `eax', `r8d') +// q quadword (e.g., `rax', `r8') +// r whole register (doubleword on x86, quadword on amd64) +// +// And types are as follows. +// +// abcd the four traditional registers `a', `b', `c', `d' +// xp the four pointer registers `si', `di', `bp', `sp' +// ip the instruction pointer `ip' +// rn the AMD64 numbered registers `r8'--`r15' +#define _DECOR(ty, decor, reg) _DECOR_##ty##_##decor(reg) + +// Internal macros: _DECOR_ty_decor(reg) applies decoration decor to +// (internal) register name reg of type ty. 
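+// For example (illustrative only): `_DECOR(abcd, d, a)' expands to `eax';
+// `_DECOR(xp, r, si)' expands to `esi' on x86 and `rsi' on AMD64; and
+// `_DECOR(rn, w, r8)' expands to `r8w' (AMD64 only).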
+ +#define _DECOR_abcd_b(reg) reg##l +#define _DECOR_abcd_h(reg) reg##h +#define _DECOR_abcd_w(reg) reg##x +#define _DECOR_abcd_d(reg) e##reg##x +#if CPUFAM_AMD64 +# define _DECOR_abcd_q(reg) r##reg##x +#endif + +#define _DECOR_xp_w(reg) reg +#define _DECOR_xp_d(reg) e##reg +#if CPUFAM_AMD64 +# define _DECOR_xp_b(reg) reg##l +# define _DECOR_xp_q(reg) r##reg +#endif + +#define _DECOR_ip_w(reg) reg +#define _DECOR_ip_d(reg) e##reg +#if CPUFAM_AMD64 +# define _DECOR_ip_q(reg) r##reg +#endif + +#if CPUFAM_AMD64 +# define _DECOR_rn_b(reg) reg##b +# define _DECOR_rn_w(reg) reg##w +# define _DECOR_rn_d(reg) reg##d +# define _DECOR_rn_q(reg) reg +# define _DECOR_rn_r(reg) reg +#endif + +#define _DECOR_mem_b(addr) byte ptr addr +#define _DECOR_mem_w(addr) word ptr addr +#define _DECOR_mem_d(addr) dword ptr addr +#if CPUFAM_AMD64 +# define _DECOR_mem_q(addr) qword ptr addr +#endif + +#define _DECOR_imm_b(imm) byte imm +#define _DECOR_imm_w(imm) word imm +#define _DECOR_imm_d(imm) dword imm +#if CPUFAM_AMD64 +# define _DECOR_imm_q(imm) qword imm +#endif + +#if CPUFAM_X86 +# define _DECOR_abcd_r(reg) e##reg##x +# define _DECOR_xp_r(reg) e##reg +# define _DECOR_ip_r(reg) e##reg +# define _DECOR_mem_r(addr) dword ptr addr +# define _DECOR_imm_r(imm) dword imm +#endif +#if CPUFAM_AMD64 +# define _DECOR_abcd_r(reg) r##reg##x +# define _DECOR_xp_r(reg) r##reg +# define _DECOR_ip_r(reg) r##reg +# define _DECOR_mem_r(addr) qword ptr addr +# define _DECOR_imm_r(imm) qword imm +#endif + +// R_r(decor) applies decoration decor to register r, which is an internal +// register name. The internal register names are: `ip', `a', `b', `c', `d', +// `si', `di', `bp', `sp', `r8'--`r15'. +#define R_nil(decor) nil +#define R_ip(decor) _DECOR(ip, decor, ip) +#define R_a(decor) _DECOR(abcd, decor, a) +#define R_b(decor) _DECOR(abcd, decor, b) +#define R_c(decor) _DECOR(abcd, decor, c) +#define R_d(decor) _DECOR(abcd, decor, d) +#define R_si(decor) _DECOR(xp, decor, si) +#define R_di(decor) _DECOR(xp, decor, di) +#define R_bp(decor) _DECOR(xp, decor, bp) +#define R_sp(decor) _DECOR(xp, decor, sp) +#if CPUFAM_AMD64 +# define R_r8(decor) _DECOR(rn, decor, r8) +# define R_r9(decor) _DECOR(rn, decor, r9) +# define R_r10(decor) _DECOR(rn, decor, r10) +# define R_r11(decor) _DECOR(rn, decor, r11) +# define R_r12(decor) _DECOR(rn, decor, r12) +# define R_r13(decor) _DECOR(rn, decor, r13) +# define R_r14(decor) _DECOR(rn, decor, r14) +# define R_r15(decor) _DECOR(rn, decor, r15) +#endif + +// Refer to an in-memory datum of the type implied by decor residing at +// address addr (which should supply its own square-brackets). +#define MEM(decor, addr) _DECOR(mem, decor, addr) + +// Refer to an immediate datum of the type implied by decor. +#define IMM(decor, imm) _DECOR(mem, decor, imm) + +// Applies decoration decor to assembler-level register name reg. +#define _REGFORM(reg, decor) _GLUE(_REGFORM_, reg)(decor) + +// Internal macros: _REGFORM_r(decor) applies decoration decor to an +// assembler-level register name, in place of any decoration that register +// name has already. 
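+// For example (illustrative only): `_REGFORM(eax, w)' expands to `ax', and
+// `_REGFORM(r10d, q)' expands to `r10' (AMD64 only); whatever decoration the
+// argument already carries is simply discarded.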
+ +#define _REGFORM_nil(decor) R_nil(decor) + +#define _REGFORM_ip(decor) R_ip(decor) +#define _REGFORM_eip(decor) R_ip(decor) + +#define _REGFORM_a(decor) R_a(decor) +#define _REGFORM_al(decor) R_a(decor) +#define _REGFORM_ah(decor) R_a(decor) +#define _REGFORM_ax(decor) R_a(decor) +#define _REGFORM_eax(decor) R_a(decor) + +#define _REGFORM_b(decor) R_b(decor) +#define _REGFORM_bl(decor) R_b(decor) +#define _REGFORM_bh(decor) R_b(decor) +#define _REGFORM_bx(decor) R_b(decor) +#define _REGFORM_ebx(decor) R_b(decor) + +#define _REGFORM_c(decor) R_c(decor) +#define _REGFORM_cl(decor) R_c(decor) +#define _REGFORM_ch(decor) R_c(decor) +#define _REGFORM_cx(decor) R_c(decor) +#define _REGFORM_ecx(decor) R_c(decor) + +#define _REGFORM_d(decor) R_d(decor) +#define _REGFORM_dl(decor) R_d(decor) +#define _REGFORM_dh(decor) R_d(decor) +#define _REGFORM_dx(decor) R_d(decor) +#define _REGFORM_edx(decor) R_d(decor) + +#define _REGFORM_si(decor) R_si(decor) +#define _REGFORM_sil(decor) R_si(decor) +#define _REGFORM_esi(decor) R_si(decor) + +#define _REGFORM_di(decor) R_di(decor) +#define _REGFORM_dil(decor) R_di(decor) +#define _REGFORM_edi(decor) R_di(decor) + +#define _REGFORM_bp(decor) R_bp(decor) +#define _REGFORM_bpl(decor) R_bp(decor) +#define _REGFORM_ebp(decor) R_bp(decor) + +#define _REGFORM_sp(decor) R_sp(decor) +#define _REGFORM_spl(decor) R_sp(decor) +#define _REGFORM_esp(decor) R_sp(decor) + +#if CPUFAM_AMD64 + +# define _REGFORM_rip(decor) R_ip(decor) +# define _REGFORM_rsp(decor) R_sp(decor) +# define _REGFORM_rbp(decor) R_bp(decor) +# define _REGFORM_rdi(decor) R_di(decor) +# define _REGFORM_rsi(decor) R_si(decor) +# define _REGFORM_rdx(decor) R_d(decor) +# define _REGFORM_rcx(decor) R_c(decor) +# define _REGFORM_rbx(decor) R_b(decor) +# define _REGFORM_rax(decor) R_a(decor) + +# define _REGFORM_r8(decor) R_r8(decor) +# define _REGFORM_r8b(decor) R_r8(decor) +# define _REGFORM_r8w(decor) R_r8(decor) +# define _REGFORM_r8d(decor) R_r8(decor) + +# define _REGFORM_r9(decor) R_r9(decor) +# define _REGFORM_r9b(decor) R_r9(decor) +# define _REGFORM_r9w(decor) R_r9(decor) +# define _REGFORM_r9d(decor) R_r9(decor) + +# define _REGFORM_r10(decor) R_r10(decor) +# define _REGFORM_r10b(decor) R_r10(decor) +# define _REGFORM_r10w(decor) R_r10(decor) +# define _REGFORM_r10d(decor) R_r10(decor) + +# define _REGFORM_r11(decor) R_r11(decor) +# define _REGFORM_r11b(decor) R_r11(decor) +# define _REGFORM_r11w(decor) R_r11(decor) +# define _REGFORM_r11d(decor) R_r11(decor) + +# define _REGFORM_r12(decor) R_r12(decor) +# define _REGFORM_r12b(decor) R_r12(decor) +# define _REGFORM_r12w(decor) R_r12(decor) +# define _REGFORM_r12d(decor) R_r12(decor) + +# define _REGFORM_r13(decor) R_r13(decor) +# define _REGFORM_r13b(decor) R_r13(decor) +# define _REGFORM_r13w(decor) R_r13(decor) +# define _REGFORM_r13d(decor) R_r13(decor) + +# define _REGFORM_r14(decor) R_r14(decor) +# define _REGFORM_r14b(decor) R_r14(decor) +# define _REGFORM_r14w(decor) R_r14(decor) +# define _REGFORM_r14d(decor) R_r14(decor) + +# define _REGFORM_r15(decor) R_r15(decor) +# define _REGFORM_r15b(decor) R_r15(decor) +# define _REGFORM_r15w(decor) R_r15(decor) +# define _REGFORM_r15d(decor) R_r15(decor) + +#endif + +// Macros for converting register names. +#define BYTE(reg) _REGFORM(reg, b) +#define HIBYTE(reg) _REGFORM(reg, h) +#define WORD(reg) _REGFORM(reg, w) +#define DWORD(reg) _REGFORM(reg, d) +#if CPUFAM_AMD64 +# define QWORD(reg) _REGFORM(reg, q) +#endif +#define WHOLE(reg) _REGFORM(reg, r) + +// Macros for some common registers. 
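+// (So, illustratively, `AX' below is `eax' on x86 and `rax' on AMD64, and
+// `SP' is `esp' or `rsp' respectively.)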
+#define AX R_a(r) +#define BX R_b(r) +#define CX R_c(r) +#define DX R_d(r) +#define SI R_si(r) +#define DI R_di(r) +#define BP R_bp(r) +#define SP R_sp(r) + +// Stack management and unwinding. +.macro setfp fp=BP, offset=0 + .if \offset == 0 + mov \fp, SP +#if __ELF__ + .cfi_def_cfa_register \fp +#endif +#if ABI_WIN && CPUFAM_AMD64 + .seh_setframe \fp, 0 +#endif + .else + lea \fp, [SP + \offset] +#if __ELF__ + .cfi_def_cfa_register \fp + .cfi_adjust_cfa_offset -\offset +#endif +#if ABI_WIN && CPUFAM_AMD64 + .seh_setframe \fp, \offset +#endif + .endif + .L$_frameptr_p = -1 + .macro dropfp; _dropfp \fp, \offset; .endm +.endm + +.macro _dropfp fp, offset=0 + .if \offset == 0 + mov SP, \fp +#if __ELF__ + .cfi_def_cfa_register SP +#endif + .else + lea SP, [\fp - \offset] +#if __ELF__ + .cfi_def_cfa_register SP + .cfi_adjust_cfa_offset +\offset +#endif + .endif + .L$_frameptr_p = 0 + .purgem dropfp +.endm + +.macro stalloc n + sub SP, \n +#if __ELF__ + .cfi_adjust_cfa_offset +\n +#endif +#if ABI_WIN && CPUFAM_AMD64 + .seh_stackalloc \n +#endif +.endm + +.macro stfree n + add SP, \n +#if __ELF__ + .cfi_adjust_cfa_offset -\n +#endif +.endm + +.macro pushreg r + push \r +#if __ELF__ + .cfi_adjust_cfa_offset +WORDSZ + .cfi_rel_offset \r, 0 +#endif +#if ABI_WIN && CPUFAM_AMD64 + .seh_pushreg \r +#endif +.endm + +.macro popreg r + pop \r +#if __ELF__ + .cfi_adjust_cfa_offset -WORDSZ + .cfi_restore \r +#endif +.endm + +.macro savexmm r, offset + movdqa [SP + \offset], \r +#if ABI_WIN && CPUFAM_AMD64 + .seh_savexmm \r, \offset +#endif +.endm + +.macro rstrxmm r, offset + movdqa \r, [SP + \offset] +.endm + +.macro endprologue +#if ABI_WIN && CPUFAM_AMD64 + .seh_endprologue +#endif + .L$_prologue_p = -1 +.endm + +#endif + +///-------------------------------------------------------------------------- +/// ARM-specific hacking. + +#if CPUFAM_ARMEL + +// ARM/Thumb mode things. Use ARM by default. +#define ARM .arm; .L$_pcoff = 8 +#define THUMB .thumb; .L$_pcoff = 4 + ARM + +// Set the function hooks. +#define FUNC_PREHOOK(_) .balign 4; .fnstart +#define ENDFUNC_HOOK(_) .fnend; .ltorg + +// Call external subroutine at ADDR, possibly via PLT. +.macro callext addr, cond= +#if WANT_PIC + bl\cond \addr(PLT) +#else + bl\cond \addr +#endif +.endm + +// Do I need to arrange a spare GOT register? +#if WANT_PIC +# define NEED_GOT 1 +#endif +#define GOTREG r9 + +// Maybe load GOT address into GOT. +.macro ldgot cond=, got=GOTREG +#if WANT_PIC + ldr\cond \got, .L$_ldgot$\@ +.L$_ldgot_pc$\@: + add\cond \got, pc, \got + _LIT + .balign 4 +.L$_ldgot$\@: + .word _GLOBAL_OFFSET_TABLE_ - .L$_ldgot_pc$\@ - .L$_pcoff + _ENDLIT +#endif +.endm + +// Load address of external symbol ADDR into REG, maybe using GOT. +.macro leaext reg, addr, cond=, got=GOTREG +#if WANT_PIC + ldr\cond \reg, .L$_leaext$\@ + ldr\cond \reg, [\got, \reg] + _LIT + .balign 4 +.L$_leaext$\@: + .word \addr(GOT) + _ENDLIT +#else + ldr\cond \reg, =\addr +#endif +.endm + +// Load address of external symbol ADDR into REG directly. +.macro leaextq reg, addr, cond= +#if WANT_PIC + ldr\cond \reg, .L$_leaextq$\@ +.L$_leaextq_pc$\@: + .if .L$_pcoff == 8 + ldr\cond \reg, [pc, \reg] + .else + add\cond \reg, pc + ldr\cond \reg, [\reg] + .endif + _LIT + .balign 4 +.L$_leaextq$\@: + .word \addr(GOT_PREL) + (. - .L$_leaextq_pc$\@ - .L$_pcoff) + _ENDLIT +#else + ldr\cond \reg, =\addr +#endif +.endm + +.macro vzero vz=q15 + // Set VZ (default q15) to zero. + vmov.u32 \vz, #0 +.endm + +.macro vshl128 vd, vn, nbit, vz=q15 + // Set VD to VN shifted left by NBIT. 
Assume VZ (default q15) is + // all-bits-zero. NBIT must be a multiple of 8. + .if \nbit&3 != 0 + .error "shift quantity must be whole number of bytes" + .endif + vext.8 \vd, \vz, \vn, #16 - (\nbit >> 3) +.endm + +.macro vshr128 vd, vn, nbit, vz=q15 + // Set VD to VN shifted right by NBIT. Assume VZ (default q15) is + // all-bits-zero. NBIT must be a multiple of 8. + .if \nbit&3 != 0 + .error "shift quantity must be whole number of bytes" + .endif + vext.8 \vd, \vn, \vz, #\nbit >> 3 +.endm + +// Apply decoration decor to register name reg. +#define _REGFORM(reg, decor) _GLUE(_REGFORM_, reg)(decor) + +// Internal macros: `_REGFORM_r(decor)' applies decoration decor to register +// name r. + +#define _REGFORM_nil(decor) nil + +#define _REGFORM_s0(decor) _DECOR(s, decor, 0) +#define _REGFORM_s1(decor) _DECOR(s, decor, 1) +#define _REGFORM_s2(decor) _DECOR(s, decor, 2) +#define _REGFORM_s3(decor) _DECOR(s, decor, 3) +#define _REGFORM_s4(decor) _DECOR(s, decor, 4) +#define _REGFORM_s5(decor) _DECOR(s, decor, 5) +#define _REGFORM_s6(decor) _DECOR(s, decor, 6) +#define _REGFORM_s7(decor) _DECOR(s, decor, 7) +#define _REGFORM_s8(decor) _DECOR(s, decor, 8) +#define _REGFORM_s9(decor) _DECOR(s, decor, 9) +#define _REGFORM_s10(decor) _DECOR(s, decor, 10) +#define _REGFORM_s11(decor) _DECOR(s, decor, 11) +#define _REGFORM_s12(decor) _DECOR(s, decor, 12) +#define _REGFORM_s13(decor) _DECOR(s, decor, 13) +#define _REGFORM_s14(decor) _DECOR(s, decor, 14) +#define _REGFORM_s15(decor) _DECOR(s, decor, 15) +#define _REGFORM_s16(decor) _DECOR(s, decor, 16) +#define _REGFORM_s17(decor) _DECOR(s, decor, 17) +#define _REGFORM_s18(decor) _DECOR(s, decor, 18) +#define _REGFORM_s19(decor) _DECOR(s, decor, 19) +#define _REGFORM_s20(decor) _DECOR(s, decor, 20) +#define _REGFORM_s21(decor) _DECOR(s, decor, 21) +#define _REGFORM_s22(decor) _DECOR(s, decor, 22) +#define _REGFORM_s23(decor) _DECOR(s, decor, 23) +#define _REGFORM_s24(decor) _DECOR(s, decor, 24) +#define _REGFORM_s25(decor) _DECOR(s, decor, 25) +#define _REGFORM_s26(decor) _DECOR(s, decor, 26) +#define _REGFORM_s27(decor) _DECOR(s, decor, 27) +#define _REGFORM_s28(decor) _DECOR(s, decor, 28) +#define _REGFORM_s29(decor) _DECOR(s, decor, 29) +#define _REGFORM_s30(decor) _DECOR(s, decor, 30) +#define _REGFORM_s31(decor) _DECOR(s, decor, 31) + +#define _REGFORM_d0(decor) _DECOR(d, decor, 0) +#define _REGFORM_d1(decor) _DECOR(d, decor, 1) +#define _REGFORM_d2(decor) _DECOR(d, decor, 2) +#define _REGFORM_d3(decor) _DECOR(d, decor, 3) +#define _REGFORM_d4(decor) _DECOR(d, decor, 4) +#define _REGFORM_d5(decor) _DECOR(d, decor, 5) +#define _REGFORM_d6(decor) _DECOR(d, decor, 6) +#define _REGFORM_d7(decor) _DECOR(d, decor, 7) +#define _REGFORM_d8(decor) _DECOR(d, decor, 8) +#define _REGFORM_d9(decor) _DECOR(d, decor, 9) +#define _REGFORM_d10(decor) _DECOR(d, decor, 10) +#define _REGFORM_d11(decor) _DECOR(d, decor, 11) +#define _REGFORM_d12(decor) _DECOR(d, decor, 12) +#define _REGFORM_d13(decor) _DECOR(d, decor, 13) +#define _REGFORM_d14(decor) _DECOR(d, decor, 14) +#define _REGFORM_d15(decor) _DECOR(d, decor, 15) +#define _REGFORM_d16(decor) _DECOR(d, decor, 16) +#define _REGFORM_d17(decor) _DECOR(d, decor, 17) +#define _REGFORM_d18(decor) _DECOR(d, decor, 18) +#define _REGFORM_d19(decor) _DECOR(d, decor, 19) +#define _REGFORM_d20(decor) _DECOR(d, decor, 20) +#define _REGFORM_d21(decor) _DECOR(d, decor, 21) +#define _REGFORM_d22(decor) _DECOR(d, decor, 22) +#define _REGFORM_d23(decor) _DECOR(d, decor, 23) +#define _REGFORM_d24(decor) _DECOR(d, decor, 24) +#define 
_REGFORM_d25(decor) _DECOR(d, decor, 25) +#define _REGFORM_d26(decor) _DECOR(d, decor, 26) +#define _REGFORM_d27(decor) _DECOR(d, decor, 27) +#define _REGFORM_d28(decor) _DECOR(d, decor, 28) +#define _REGFORM_d29(decor) _DECOR(d, decor, 29) +#define _REGFORM_d30(decor) _DECOR(d, decor, 30) +#define _REGFORM_d31(decor) _DECOR(d, decor, 31) + +#define _REGFORM_q0(decor) _DECOR(q, decor, 0) +#define _REGFORM_q1(decor) _DECOR(q, decor, 1) +#define _REGFORM_q2(decor) _DECOR(q, decor, 2) +#define _REGFORM_q3(decor) _DECOR(q, decor, 3) +#define _REGFORM_q4(decor) _DECOR(q, decor, 4) +#define _REGFORM_q5(decor) _DECOR(q, decor, 5) +#define _REGFORM_q6(decor) _DECOR(q, decor, 6) +#define _REGFORM_q7(decor) _DECOR(q, decor, 7) +#define _REGFORM_q8(decor) _DECOR(q, decor, 8) +#define _REGFORM_q9(decor) _DECOR(q, decor, 9) +#define _REGFORM_q10(decor) _DECOR(q, decor, 10) +#define _REGFORM_q11(decor) _DECOR(q, decor, 11) +#define _REGFORM_q12(decor) _DECOR(q, decor, 12) +#define _REGFORM_q13(decor) _DECOR(q, decor, 13) +#define _REGFORM_q14(decor) _DECOR(q, decor, 14) +#define _REGFORM_q15(decor) _DECOR(q, decor, 15) + +// `_LOPART(n)' and `_HIPART(n)' return the numbers of the register halves of +// register n, i.e., 2*n and 2*n + 1 respectively. +#define _LOPART(n) _GLUE(_LOPART_, n) +#define _HIPART(n) _GLUE(_HIPART_, n) + +// Internal macros: `_LOPART_n' and `_HIPART_n' return the numbers of the +// register halves of register n, i.e., 2*n and 2*n + 1 respectively. + +#define _LOPART_0 0 +#define _HIPART_0 1 +#define _LOPART_1 2 +#define _HIPART_1 3 +#define _LOPART_2 4 +#define _HIPART_2 5 +#define _LOPART_3 6 +#define _HIPART_3 7 +#define _LOPART_4 8 +#define _HIPART_4 9 +#define _LOPART_5 10 +#define _HIPART_5 11 +#define _LOPART_6 12 +#define _HIPART_6 13 +#define _LOPART_7 14 +#define _HIPART_7 15 +#define _LOPART_8 16 +#define _HIPART_8 17 +#define _LOPART_9 18 +#define _HIPART_9 19 +#define _LOPART_10 20 +#define _HIPART_10 21 +#define _LOPART_11 22 +#define _HIPART_11 23 +#define _LOPART_12 24 +#define _HIPART_12 25 +#define _LOPART_13 26 +#define _HIPART_13 27 +#define _LOPART_14 28 +#define _HIPART_14 29 +#define _LOPART_15 30 +#define _HIPART_15 31 + +// Return the register number of the pair containing register n, i.e., +// floor(n/2). +#define _PAIR(n) _GLUE(_PAIR_, n) + +// Internal macros: `_PAIR_n' returns the register number of the pair +// containing register n, i.e., floor(n/2). +#define _PAIR_0 0 +#define _PAIR_1 0 +#define _PAIR_2 1 +#define _PAIR_3 1 +#define _PAIR_4 2 +#define _PAIR_5 2 +#define _PAIR_6 3 +#define _PAIR_7 3 +#define _PAIR_8 4 +#define _PAIR_9 4 +#define _PAIR_10 5 +#define _PAIR_11 5 +#define _PAIR_12 6 +#define _PAIR_13 6 +#define _PAIR_14 7 +#define _PAIR_15 7 +#define _PAIR_16 8 +#define _PAIR_17 8 +#define _PAIR_18 9 +#define _PAIR_19 9 +#define _PAIR_20 10 +#define _PAIR_21 10 +#define _PAIR_22 11 +#define _PAIR_23 11 +#define _PAIR_24 12 +#define _PAIR_25 12 +#define _PAIR_26 13 +#define _PAIR_27 13 +#define _PAIR_28 14 +#define _PAIR_29 14 +#define _PAIR_30 15 +#define _PAIR_31 15 + +// Apply decoration decor to register number n of type ty. Decorations are +// as follows. 
+// +// decor types meaning +// Q s, d the NEON qN register containing this one +// D s the NEON dN register containing this one +// D0 q the low 64-bit half of this one +// D1 q the high 64-bit half of this one +// S0 d, q the first 32-bit piece of this one +// S1 d, q the second 32-bit piece of this one +// S2 q the third 32-bit piece of this one +// S3 q the fourth 32-bit piece of this one +// Bn q the nth byte of this register, as a scalar +// Hn q the nth halfword of this register, as a scalar +// Wn q the nth word of this register, as a scalar +#define _DECOR(ty, decor, n) _DECOR_##ty##_##decor(n) + +// Internal macros: `_DECOR_ty_decor(n)' applies decoration decor to register +// number n of type ty. + +#define _DECOR_s_Q(n) GLUE(q, _PAIR(_PAIR(n))) +#define _DECOR_s_D(n) GLUE(d, _PAIR(n)) + +#define _DECOR_d_Q(n) GLUE(q, _PAIR(n)) +#define _DECOR_d_S0(n) GLUE(s, _LOPART(n)) +#define _DECOR_d_S1(n) GLUE(s, _LOPART(n)) + +#define _DECOR_q_D0(n) GLUE(d, _LOPART(n)) +#define _DECOR_q_D1(n) GLUE(d, _HIPART(n)) +#define _DECOR_q_S0(n) GLUE(s, _LOPART(_LOPART(n))) +#define _DECOR_q_S1(n) GLUE(s, _HIPART(_LOPART(n))) +#define _DECOR_q_S2(n) GLUE(s, _LOPART(_HIPART(n))) +#define _DECOR_q_S3(n) GLUE(s, _HIPART(_HIPART(n))) +#define _DECOR_q_W0(n) GLUE(d, _LOPART(n))[0] +#define _DECOR_q_W1(n) GLUE(d, _LOPART(n))[1] +#define _DECOR_q_W2(n) GLUE(d, _HIPART(n))[0] +#define _DECOR_q_W3(n) GLUE(d, _HIPART(n))[1] +#define _DECOR_q_H0(n) GLUE(d, _LOPART(n))[0] +#define _DECOR_q_H1(n) GLUE(d, _LOPART(n))[1] +#define _DECOR_q_H2(n) GLUE(d, _LOPART(n))[2] +#define _DECOR_q_H3(n) GLUE(d, _LOPART(n))[3] +#define _DECOR_q_H4(n) GLUE(d, _HIPART(n))[0] +#define _DECOR_q_H5(n) GLUE(d, _HIPART(n))[1] +#define _DECOR_q_H6(n) GLUE(d, _HIPART(n))[2] +#define _DECOR_q_H7(n) GLUE(d, _HIPART(n))[3] +#define _DECOR_q_B0(n) GLUE(d, _LOPART(n))[0] +#define _DECOR_q_B1(n) GLUE(d, _LOPART(n))[1] +#define _DECOR_q_B2(n) GLUE(d, _LOPART(n))[2] +#define _DECOR_q_B3(n) GLUE(d, _LOPART(n))[3] +#define _DECOR_q_B4(n) GLUE(d, _LOPART(n))[4] +#define _DECOR_q_B5(n) GLUE(d, _LOPART(n))[5] +#define _DECOR_q_B6(n) GLUE(d, _LOPART(n))[6] +#define _DECOR_q_B7(n) GLUE(d, _LOPART(n))[7] +#define _DECOR_q_B8(n) GLUE(d, _HIPART(n))[0] +#define _DECOR_q_B9(n) GLUE(d, _HIPART(n))[1] +#define _DECOR_q_B10(n) GLUE(d, _HIPART(n))[2] +#define _DECOR_q_B11(n) GLUE(d, _HIPART(n))[3] +#define _DECOR_q_B12(n) GLUE(d, _HIPART(n))[4] +#define _DECOR_q_B13(n) GLUE(d, _HIPART(n))[5] +#define _DECOR_q_B14(n) GLUE(d, _HIPART(n))[6] +#define _DECOR_q_B15(n) GLUE(d, _HIPART(n))[7] + +// Macros for navigating the NEON register hierarchy. +#define S0(reg) _REGFORM(reg, S0) +#define S1(reg) _REGFORM(reg, S1) +#define S2(reg) _REGFORM(reg, S2) +#define S3(reg) _REGFORM(reg, S3) +#define D(reg) _REGFORM(reg, D) +#define D0(reg) _REGFORM(reg, D0) +#define D1(reg) _REGFORM(reg, D1) +#define Q(reg) _REGFORM(reg, Q) + +// Macros for indexing quadword registers. +#define QB(reg, i) _REGFORM(reg, B##i) +#define QH(reg, i) _REGFORM(reg, H##i) +#define QW(reg, i) _REGFORM(reg, W##i) + +// Macros for converting vldm/vstm ranges. +#define QQ(qlo, qhi) D0(qlo)-D1(qhi) + +// Stack management and unwinding. 
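+// As an illustration only (the register choices below are invented for the
+// example), a function using these macros typically looks like this:
+//
+//	FUNC(example)
+//		pushreg	r4, r5, r11, lr
+//		setfp
+//		endprologue
+//		...
+//		dropfp
+//		popreg	r4, r5, r11, pc
+//	ENDFUNC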
+.macro setfp fp=r11, offset=0 + .if \offset == 0 + mov \fp, sp + .setfp \fp, sp + .else + add \fp, sp, #\offset + .setfp \fp, sp, #\offset + .endif + .macro dropfp; _dropfp \fp, \offset; .endm + .L$_frameptr_p = -1 +.endm + +.macro _dropfp fp, offset=0 + .if \offset == 0 + mov sp, \fp + .else + sub sp, \fp, #\offset + .endif + .purgem dropfp + .L$_frameptr_p = 0 +.endm + +.macro stalloc n + sub sp, sp, #\n + .pad #\n +.endm + +.macro stfree n + add sp, sp, #\n + .pad #-\n +.endm + +.macro pushreg rr:vararg + push {\rr} + .save {\rr} +.endm + +.macro popreg rr:vararg + pop {\rr} +.endm + +.macro pushvfp rr:vararg + vstmdb sp!, {\rr} + .vsave {\rr} +.endm + +.macro popvfp rr:vararg + vldmia sp!, {\rr} +.endm + +.macro endprologue +.endm + +// No need for prologue markers on ARM. +#define FUNC_POSTHOOK(_) .L$_prologue_p = -1 + +#endif + +///-------------------------------------------------------------------------- +/// AArch64-specific hacking. + +#if CPUFAM_ARM64 + +// Set the function hooks. +#define FUNC_PREHOOK(_) .balign 4 +#define FUNC_POSTHOOK(_) .cfi_startproc; .L$_prologue_p = -1 +#define ENDFUNC_HOOK(_) .cfi_endproc + +// Call external subroutine at ADDR, possibly via PLT. +.macro callext addr + bl \addr +.endm + +// Load address of external symbol ADDR into REG. +.macro leaext reg, addr +#if WANT_PIC + adrp \reg, :got:\addr + ldr \reg, [\reg, #:got_lo12:\addr] +#else + adrp \reg, \addr + add \reg, \reg, #:lo12:\addr +#endif +.endm + +.macro vzero vz=v31 + // Set VZ (default v31) to zero. + dup \vz\().4s, wzr +.endm + +.macro vshl128 vd, vn, nbit, vz=v31 + // Set VD to VN shifted left by NBIT. Assume VZ (default v31) is + // all-bits-zero. NBIT must be a multiple of 8. + .if \nbit&3 != 0 + .error "shift quantity must be whole number of bytes" + .endif + ext \vd\().16b, \vz\().16b, \vn\().16b, #16 - (\nbit >> 3) +.endm + +.macro vshr128 vd, vn, nbit, vz=v31 + // Set VD to VN shifted right by NBIT. Assume VZ (default v31) is + // all-bits-zero. NBIT must be a multiple of 8. + .if \nbit&3 != 0 + .error "shift quantity must be whole number of bytes" + .endif + ext \vd\().16b, \vn\().16b, \vz\().16b, #\nbit >> 3 +.endm + +// Stack management and unwinding. +.macro setfp fp=x29, offset=0 + // If you're just going through the motions with a fixed-size stack frame, + // then you want to say `add x29, sp, #OFFSET' directly, which will avoid + // pointlessly restoring sp later. + .if \offset == 0 + mov \fp, sp + .cfi_def_cfa_register \fp + .else + add \fp, sp, #\offset + .cfi_def_cfa_register \fp + .cfi_adjust_cfa_offset -\offset + .endif + .macro dropfp; _dropfp \fp, \offset; .endm + .L$_frameptr_p = -1 +.endm + +.macro _dropfp fp, offset=0 + .if \offset == 0 + mov sp, \fp + .cfi_def_cfa_register sp + .else + sub sp, \fp, #\offset + .cfi_def_cfa_register sp + .cfi_adjust_cfa_offset +\offset + .endif + .purgem dropfp + .L$_frameptr_p = 0 +.endm + +.macro stalloc n + sub sp, sp, #\n + .cfi_adjust_cfa_offset +\n +.endm + +.macro stfree n + add sp, sp, #\n + .cfi_adjust_cfa_offset -\n +.endm + +.macro pushreg x, y=nil + .ifeqs "\y", "nil" + str \x, [sp, #-16]! + .cfi_adjust_cfa_offset +16 + .cfi_rel_offset \x, 0 + .else + stp \x, \y, [sp, #-16]! 
+ .cfi_adjust_cfa_offset +16 + .cfi_rel_offset \x, 0 + .cfi_rel_offset \y, 8 + .endif +.endm + +.macro popreg x, y=nil + .ifeqs "\y", "nil" + ldr \x, [sp], #16 + .cfi_restore \x + .cfi_adjust_cfa_offset -16 + .else + ldp \x, \y, [sp], #16 + .cfi_restore \x + .cfi_restore \y + .cfi_adjust_cfa_offset -16 + .endif +.endm + +.macro savereg x, y, z=nil + .ifeqs "\z", "nil" + str \x, [sp, \y] + .cfi_rel_offset \x, \y + .else + stp \x, \y, [sp, #\z] + .cfi_rel_offset \x, \z + .cfi_rel_offset \y, \z + 8 + .endif +.endm + +.macro rstrreg x, y, z=nil + .ifeqs "\z", "nil" + ldr \x, [sp, \y] + .cfi_restore \x + .else + ldp \x, \y, [sp, #\z] + .cfi_restore \x + .cfi_restore \y + .endif +.endm + +.macro endprologue +.endm + +// cmov RD, RN, CC: set RD to RN if CC is satisfied, otherwise do nothing +.macro cmov rd, rn, cc + csel \rd, \rn, \rd, \cc +.endm + +// Notational improvement: write `csel.CC' etc., rather than `csel ..., CC'. +#define _COND(_) \ + _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \ + _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \ + _(hs) _(lo) +#define _INST(_) \ + _(ccmp) _(ccmn) \ + _(csel) _(cmov) \ + _(csinc) _(cinc) _(cset) \ + _(csneg) _(cneg) \ + _(csinv) _(cinv) _(csetm) +#define _CONDVAR(cc) _definstvar cc; +#define _INSTVARS(inst) \ + .macro _definstvar cc; \ + .macro inst.\cc args:vararg; inst \args, \cc; .endm; \ + .endm; \ + _COND(_CONDVAR); \ + .purgem _definstvar; + _INST(_INSTVARS) +#undef _COND +#undef _INST +#undef _CONDVAR +#undef _INSTVARS + +// Flag bits for `ccmp' and friends. +#define CCMP_N 8 +#define CCMP_Z 4 +#define CCMP_C 2 +#define CCMP_V 1 + +// Flag settings for satisfying conditions. +#define CCMP_MI CCMP_N +#define CCMP_PL 0 +#define CCMP_EQ CCMP_Z +#define CCMP_NE 0 +#define CCMP_CS CCMP_C +#define CCMP_HS CCMP_C +#define CCMP_CC 0 +#define CCMP_LO 0 +#define CCMP_VS CCMP_V +#define CCMP_VC 0 +#define CCMP_HI CCMP_C +#define CCMP_LS 0 +#define CCMP_LT CCMP_N +#define CCMP_GE 0 +#define CCMP_LE CCMP_N +#define CCMP_GT 0 + +#endif + +///-------------------------------------------------------------------------- +/// Final stuff. + +// Default values for the various hooks. +#ifndef FUNC_PREHOOK +# define FUNC_PREHOOK(_) +#endif +#ifndef FUNC_POSTHOOK +# define FUNC_POSTHOOK(_) +#endif +#ifndef ENDFUNC_HOOK +# define ENDFUNC_HOOK(_) +#endif + +#ifndef F +# ifdef SYM_USCORE +# define F(name) _##name +# else +# define F(name) name +# endif +#endif + +#ifndef TYPE_FUNC +# define TYPE_FUNC(name) +#endif + +#ifndef SIZE_OBJ +# define SIZE_OBJ(name) +#endif + +#if __ELF__ && !defined(WANT_EXECUTABLE_STACK) + .pushsection .note.GNU-stack, "", _SECTTY(progbits) + .popsection +#endif + +///----- That's all, folks -------------------------------------------------- + +#endif diff --git a/gcm-arm64-pmull.S b/gcm-arm64-pmull.S new file mode 100644 index 0000000..6564fdd --- /dev/null +++ b/gcm-arm64-pmull.S @@ -0,0 +1,631 @@ +#include "asm-common.h" + + .arch armv8-a+crypto + + .text + +///-------------------------------------------------------------------------- +/// Multiplication macros. + + // The good news is that we have a fancy instruction to do the + // multiplications. The bad news is that it's not particularly well- + // suited to the job. + // + // For one thing, it only does a 64-bit multiplication, so in general + // we'll need to synthesize the full-width multiply by hand. For + // another thing, it doesn't help with the reduction, so we have to + // do that by hand too. 
And, finally, GCM has crazy bit ordering, + // and the instruction does nothing useful for that at all. + // + // Focusing on that last problem first: the bits aren't in monotonic + // significance order unless we permute them. Fortunately, ARM64 has + // an instruction which will just permute the bits in each byte for + // us, so we don't have to worry about this very much. + // + // Our main weapons, the `pmull' and `pmull2' instructions, work on + // 64-bit operands, in half of a vector register, and produce 128-bit + // results. But neither of them will multiply the high half of one + // vector by the low half of a second one, so we have a problem, + // which we solve by representing one of the operands redundantly: + // rather than packing the 64-bit pieces together, we duplicate each + // 64-bit piece across both halves of a register. + // + // The commentary for `mul128' is the most detailed. The other + // macros assume that you've already read and understood that. + +.macro mul128 + // Enter with u and v in v0 and v1/v2 respectively, and 0 in v31; + // leave with z = u v in v0. Clobbers v1--v6. + + // First for the double-precision multiplication. It's tempting to + // use Karatsuba's identity here, but I suspect that loses more in + // the shifting, bit-twiddling, and dependency chains that it gains + // in saving a multiplication which otherwise pipelines well. + // v0 = // (u_0; u_1) + // v1/v2 = // (v_0; v_1) + pmull2 v3.1q, v0.2d, v1.2d // u_1 v_0 + pmull v4.1q, v0.1d, v2.1d // u_0 v_1 + pmull2 v5.1q, v0.2d, v2.2d // (t_1; x_3) = u_1 v_1 + pmull v6.1q, v0.1d, v1.1d // (x_0; t_0) = u_0 v_0 + + // Arrange the pieces to form a double-precision polynomial. + eor v3.16b, v3.16b, v4.16b // (m_0; m_1) = u_0 v_1 + u_1 v_0 + vshr128 v4, v3, 64 // (m_1; 0) + vshl128 v3, v3, 64 // (0; m_0) + eor v1.16b, v5.16b, v4.16b // (x_2; x_3) + eor v0.16b, v6.16b, v3.16b // (x_0; x_1) + + // And now the only remaining difficulty is that the result needs to + // be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 + // = t^7 + t^2 + t + 1 in our field. So far, we've calculated z_0 + // and z_1 such that z_0 + z_1 R = u v using the identity R = t^128: + // now we must collapse the two halves of y together using the other + // identity R = t^7 + t^2 + t + 1. + // + // We do this by working on y_2 and y_3 separately, so consider y_i + // for i = 2 or 3. Certainly, y_i t^{64i} = y_i R t^{64(i-2) = + // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that + // directly without breaking up the 64-bit word structure. Instead, + // we start by considering just y_i t^7 t^{64(i-2)}, which again + // looks tricky. Now, split y_i = a_i + t^57 b_i, with deg a_i < 57; + // then + // + // y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)} + // + // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit + // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the + // splits are different. This is lovely, with one small snag: when + // we do this to y_3, we end up with a contribution back into the + // t^128 coefficient word. But notice that only the low seven bits + // of this word are affected, so there's no knock-on contribution + // into the t^64 word. Therefore, if we handle the high bits of each + // word together, and then the low bits, everything will be fine. + + // First, shift the high bits down. 
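+	// (As a cross-check, the whole reduction can be expressed as the
+	// following bit-at-a-time C reference model -- an illustrative,
+	// non-constant-time sketch only, where z[0]..z[3] are the 64-bit
+	// words x_0..x_3 of the double-precision product, least significant
+	// first:
+	//
+	//	static const int tail[] = { 0, 1, 2, 7 };
+	//	for (int i = 255; i >= 128; i--)
+	//		if ((z[i/64] >> (i%64)) & 1) {
+	//			z[i/64] ^= 1ull << (i%64);
+	//			for (int k = 0; k < 4; k++) {
+	//				int j = i - 128 + tail[k];
+	//				z[j/64] ^= 1ull << (j%64);
+	//			}
+	//		}
+	//
+	// On exit z[2] and z[3] are zero and z[0], z[1] hold the result.
+	// The code below gets the same answer without a loop: the `ushr's
+	// pick out the b_i described above, and the `shl's then handle the
+	// low bits.)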
+ ushr v2.2d, v1.2d, #63 // the b_i for t + ushr v3.2d, v1.2d, #62 // the b_i for t^2 + ushr v4.2d, v1.2d, #57 // the b_i for t^7 + eor v2.16b, v2.16b, v3.16b // add them all together + eor v2.16b, v2.16b, v4.16b + vshr128 v3, v2, 64 + vshl128 v4, v2, 64 + eor v1.16b, v1.16b, v3.16b // contribution into high half + eor v0.16b, v0.16b, v4.16b // and low half + + // And then shift the low bits up. + shl v2.2d, v1.2d, #1 + shl v3.2d, v1.2d, #2 + shl v4.2d, v1.2d, #7 + eor v1.16b, v1.16b, v2.16b // unit and t contribs + eor v3.16b, v3.16b, v4.16b // t^2 and t^7 contribs + eor v0.16b, v0.16b, v1.16b // mix everything together + eor v0.16b, v0.16b, v3.16b // ... and we're done +.endm + +.macro mul64 + // Enter with u and v in the low halves of v0 and v1, respectively; + // leave with z = u v in x2. Clobbers x2--x4. + + // The multiplication is thankfully easy. + // v0 = // (u; ?) + // v1 = // (v; ?) + pmull v0.1q, v0.1d, v1.1d // u v + + // Now we must reduce. This is essentially the same as the 128-bit + // case above, but mostly simpler because everything is smaller. The + // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. + + // Before we get stuck in, transfer the product to general-purpose + // registers. + mov x3, v0.d[1] + mov x2, v0.d[0] + + // First, shift the high bits down. + eor x4, x3, x3, lsr #1 // pre-mix t^3 and t^4 + eor x3, x3, x3, lsr #63 // mix in t contribution + eor x3, x3, x4, lsr #60 // shift and mix in t^3 and t^4 + + // And then shift the low bits up. + eor x3, x3, x3, lsl #1 // mix unit and t; pre-mix t^3, t^4 + eor x2, x2, x3 // fold them in + eor x2, x2, x3, lsl #3 // and t^3 and t^4 +.endm + +.macro mul96 + // Enter with u in the least-significant 96 bits of v0, with zero in + // the upper 32 bits, and with the least-significant 64 bits of v in + // both halves of v1, and the upper 32 bits of v in the low 32 bits + // of each half of v2, with zero in the upper 32 bits; and with zero + // in v31. Yes, that's a bit hairy. Leave with the product u v in + // the low 96 bits of v0, and /junk/ in the high 32 bits. Clobbers + // v1--v6. + + // This is an inconvenient size. There's nothing for it but to do + // four multiplications, as if for the 128-bit case. It's possible + // that there's cruft in the top 32 bits of the input registers, so + // shift both of them up by four bytes before we start. This will + // mean that the high 64 bits of the result (from GCM's viewpoint) + // will be zero. + // v0 = // (u_0 + u_1 t^32; u_2) + // v1 = // (v_0 + v_1 t^32; v_0 + v_1 t^32) + // v2 = // (v_2; v_2) + pmull2 v5.1q, v0.2d, v1.2d // u_2 (v_0 + v_1 t^32) t^32 = e_0 + pmull v4.1q, v0.1d, v2.1d // v_2 (u_0 + u_1 t^32) t^32 = e_1 + pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (d; 0) + pmull v3.1q, v0.1d, v1.1d // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32 + // + u_1 v_1 t^64 = f + + // Extract the high and low halves of the 192-bit result. The answer + // we want is d t^128 + e t^64 + f, where e = e_0 + e_1. The low 96 + // bits of the answer will end up in v0, with junk in the top 32 + // bits; the high 96 bits will end up in v1, which must have zero in + // its top 32 bits. + // + // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged + // in the low 96 bits of a SIMD register, with junk in the top 32 + // bits; and top(x) is the high 96 bits, also arranged in the low 96 + // bits of a register, with /zero/ in the top 32 bits. 
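+	// (For a cross-check of the reduction further down, the same
+	// bit-at-a-time reference model sketched in `mul128' above applies,
+	// just with a 192-bit product and the tail exponents { 0, 6, 9, 10 }
+	// of p(t) = t^96 + t^10 + t^9 + t^6 + 1.)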
+ eor v4.16b, v4.16b, v5.16b // e_0 + e_1 = e + vshl128 v6, v6, 32 // top(d t^128) + vshr128 v5, v4, 32 // top(e t^64) + vshl128 v4, v4, 64 // bot(e t^64) + vshr128 v1, v3, 96 // top(f) + eor v6.16b, v6.16b, v5.16b // top(d t^128 + e t^64) + eor v0.16b, v3.16b, v4.16b // bot([d t^128] + e t^64 + f) + eor v1.16b, v1.16b, v6.16b // top(e t^64 + d t^128 + f) + + // Finally, the reduction. This is essentially the same as the + // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 + + // t^9 + t^6 + 1. The degrees are larger but not enough to cause + // trouble for the general approach. Unfortunately, we have to do + // this in 32-bit pieces rather than 64. + + // First, shift the high bits down. + ushr v2.4s, v1.4s, #26 // the b_i for t^6 + ushr v3.4s, v1.4s, #23 // the b_i for t^9 + ushr v4.4s, v1.4s, #22 // the b_i for t^10 + eor v2.16b, v2.16b, v3.16b // add them all together + eor v2.16b, v2.16b, v4.16b + vshr128 v3, v2, 64 // contribution for high half + vshl128 v2, v2, 32 // contribution for low half + eor v1.16b, v1.16b, v3.16b // apply to high half + eor v0.16b, v0.16b, v2.16b // and low half + + // And then shift the low bits up. + shl v2.4s, v1.4s, #6 + shl v3.4s, v1.4s, #9 + shl v4.4s, v1.4s, #10 + eor v1.16b, v1.16b, v2.16b // unit and t^6 contribs + eor v3.16b, v3.16b, v4.16b // t^9 and t^10 contribs + eor v0.16b, v0.16b, v1.16b // mix everything together + eor v0.16b, v0.16b, v3.16b // ... and we're done +.endm + +.macro mul192 + // Enter with u in v0 and the less-significant half of v1, with v + // duplicated across both halves of v2/v3/v4, and with zero in v31. + // Leave with the product u v in v0 and the bottom half of v1. + // Clobbers v16--v25. + + // Start multiplying and accumulating pieces of product. + // v0 = // (u_0; u_1) + // v1 = // (u_2; ?) + // v2 = // (v_0; v_0) + // v3 = // (v_1; v_1) + // v4 = // (v_2; v_2) + pmull v16.1q, v0.1d, v2.1d // a = u_0 v_0 + + pmull v19.1q, v0.1d, v3.1d // u_0 v_1 + pmull2 v21.1q, v0.2d, v2.2d // u_1 v_0 + + pmull v17.1q, v0.1d, v4.1d // u_0 v_2 + pmull2 v22.1q, v0.2d, v3.2d // u_1 v_1 + pmull v23.1q, v1.1d, v2.1d // u_2 v_0 + eor v19.16b, v19.16b, v21.16b // b = u_0 v_1 + u_1 v_0 + + pmull2 v20.1q, v0.2d, v4.2d // u_1 v_2 + pmull v24.1q, v1.1d, v3.1d // u_2 v_1 + eor v17.16b, v17.16b, v22.16b // u_0 v_2 + u_1 v_1 + + pmull v18.1q, v1.1d, v4.1d // e = u_2 v_2 + eor v17.16b, v17.16b, v23.16b // c = u_0 v_2 + u_1 v_1 + u_2 v_1 + eor v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1 + + // Piece the product together. + // v16 = // (a_0; a_1) + // v19 = // (b_0; b_1) + // v17 = // (c_0; c_1) + // v20 = // (d_0; d_1) + // v18 = // (e_0; e_1) + vshl128 v21, v19, 64 // (0; b_0) + ext v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0) + vshr128 v23, v20, 64 // (d_1; 0) + eor v16.16b, v16.16b, v21.16b // (x_0; x_1) + eor v17.16b, v17.16b, v22.16b // (x_2; x_3) + eor v18.16b, v18.16b, v23.16b // (x_2; x_3) + + // Next, the reduction. Our polynomial this time is p(x) = t^192 + + // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the + // 128-bit case. I don't know why. + + // First, shift the high bits down. + // v16 = // (y_0; y_1) + // v17 = // (y_2; y_3) + // v18 = // (y_4; y_5) + mov v19.d[0], v17.d[1] // (y_3; ?) 
+ + ushr v23.2d, v18.2d, #63 // hi b_i for t + ushr d20, d19, #63 // lo b_i for t + ushr v24.2d, v18.2d, #62 // hi b_i for t^2 + ushr d21, d19, #62 // lo b_i for t^2 + ushr v25.2d, v18.2d, #57 // hi b_i for t^7 + ushr d22, d19, #57 // lo b_i for t^7 + eor v23.16b, v23.16b, v24.16b // mix them all together + eor v20.8b, v20.8b, v21.8b + eor v23.16b, v23.16b, v25.16b + eor v20.8b, v20.8b, v22.8b + + // Permute the high pieces while we fold in the b_i. + eor v17.16b, v17.16b, v23.16b + vshl128 v20, v20, 64 + mov v19.d[0], v18.d[1] // (y_5; ?) + ext v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4) + eor v16.16b, v16.16b, v20.16b + + // And finally shift the low bits up. + // v16 = // (y'_0; y'_1) + // v17 = // (y'_2; ?) + // v18 = // (y'_3; y'_4) + // v19 = // (y'_5; ?) + shl v20.2d, v18.2d, #1 + shl d23, d19, #1 + shl v21.2d, v18.2d, #2 + shl d24, d19, #2 + shl v22.2d, v18.2d, #7 + shl d25, d19, #7 + eor v18.16b, v18.16b, v20.16b // unit and t contribs + eor v19.8b, v19.8b, v23.8b + eor v21.16b, v21.16b, v22.16b // t^2 and t^7 contribs + eor v24.8b, v24.8b, v25.8b + eor v18.16b, v18.16b, v21.16b // all contribs + eor v19.8b, v19.8b, v24.8b + eor v0.16b, v16.16b, v18.16b // mix them into the low half + eor v1.8b, v17.8b, v19.8b +.endm + +.macro mul256 + // Enter with u in v0/v1, with v duplicated across both halves of + // v2--v5, and with zero in v31. Leave with the product u v in + // v0/v1. Clobbers ???. + + // Now it's starting to look worthwhile to do Karatsuba. Suppose + // u = u_0 + u_1 B and v = v_0 + v_1 B. Then + // + // u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2 + // + // Name these coefficients of B^i be a, b, and c, respectively, and + // let r = u_0 + u_1 and s = v_0 + v_1. Then observe that + // + // q = r s = (u_0 + u_1) (v_0 + v_1) + // = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0) + // = a + d + c + // + // The first two terms we've already calculated; the last is the + // remaining one we want. We'll set B = t^128. We know how to do + // 128-bit multiplications already, and Karatsuba is too annoying + // there, so there'll be 12 multiplications altogether, rather than + // the 16 we'd have if we did this the naïve way. + // v0 = // u_0 = (u_00; u_01) + // v1 = // u_1 = (u_10; u_11) + // v2 = // (v_00; v_00) + // v3 = // (v_01; v_01) + // v4 = // (v_10; v_10) + // v5 = // (v_11; v_11) + + eor v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11) + eor v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10 + eor v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11 + + // Start by building the cross product, q = u_* v_*. + pmull v24.1q, v28.1d, v30.1d // u_*0 v_*1 + pmull2 v25.1q, v28.2d, v29.2d // u_*1 v_*0 + pmull v20.1q, v28.1d, v29.1d // u_*0 v_*0 + pmull2 v21.1q, v28.2d, v30.2d // u_*1 v_*1 + eor v24.16b, v24.16b, v25.16b // u_*0 v_*1 + u_*1 v_*0 + vshr128 v25, v24, 64 + vshl128 v24, v24, 64 + eor v20.16b, v20.16b, v24.16b // q_0 + eor v21.16b, v21.16b, v25.16b // q_1 + + // Next, work on the low half, a = u_0 v_0 + pmull v24.1q, v0.1d, v3.1d // u_00 v_01 + pmull2 v25.1q, v0.2d, v2.2d // u_01 v_00 + pmull v16.1q, v0.1d, v2.1d // u_00 v_00 + pmull2 v17.1q, v0.2d, v3.2d // u_01 v_01 + eor v24.16b, v24.16b, v25.16b // u_00 v_01 + u_01 v_00 + vshr128 v25, v24, 64 + vshl128 v24, v24, 64 + eor v16.16b, v16.16b, v24.16b // a_0 + eor v17.16b, v17.16b, v25.16b // a_1 + + // Mix the pieces we have so far. 
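+	// (That is, we are recovering the middle Karatsuba coefficient
+	// b = u_0 v_1 + u_1 v_0 via b = q + a + c: a is folded into q here,
+	// and c is folded in below once it has been computed.)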
+ eor v20.16b, v20.16b, v16.16b + eor v21.16b, v21.16b, v17.16b + + // Finally, work on the high half, c = u_1 v_1 + pmull v24.1q, v1.1d, v5.1d // u_10 v_11 + pmull2 v25.1q, v1.2d, v4.2d // u_11 v_10 + pmull v18.1q, v1.1d, v4.1d // u_10 v_10 + pmull2 v19.1q, v1.2d, v5.2d // u_11 v_11 + eor v24.16b, v24.16b, v25.16b // u_10 v_11 + u_11 v_10 + vshr128 v25, v24, 64 + vshl128 v24, v24, 64 + eor v18.16b, v18.16b, v24.16b // c_0 + eor v19.16b, v19.16b, v25.16b // c_1 + + // Finish mixing the product together. + eor v20.16b, v20.16b, v18.16b + eor v21.16b, v21.16b, v19.16b + eor v17.16b, v17.16b, v20.16b + eor v18.16b, v18.16b, v21.16b + + // Now we must reduce. This is essentially the same as the 192-bit + // case above, but more complicated because everything is bigger. + // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. + // v16 = // (y_0; y_1) + // v17 = // (y_2; y_3) + // v18 = // (y_4; y_5) + // v19 = // (y_6; y_7) + ushr v24.2d, v18.2d, #62 // (y_4; y_5) b_i for t^2 + ushr v25.2d, v19.2d, #62 // (y_6; y_7) b_i for t^2 + ushr v26.2d, v18.2d, #59 // (y_4; y_5) b_i for t^5 + ushr v27.2d, v19.2d, #59 // (y_6; y_7) b_i for t^5 + ushr v28.2d, v18.2d, #54 // (y_4; y_5) b_i for t^10 + ushr v29.2d, v19.2d, #54 // (y_6; y_7) b_i for t^10 + eor v24.16b, v24.16b, v26.16b // mix the contributions together + eor v25.16b, v25.16b, v27.16b + eor v24.16b, v24.16b, v28.16b + eor v25.16b, v25.16b, v29.16b + vshr128 v26, v25, 64 // slide contribs into position + ext v25.16b, v24.16b, v25.16b, #8 + vshl128 v24, v24, 64 + eor v18.16b, v18.16b, v26.16b + eor v17.16b, v17.16b, v25.16b + eor v16.16b, v16.16b, v24.16b + + // And then shift the low bits up. + // v16 = // (y'_0; y'_1) + // v17 = // (y'_2; y'_3) + // v18 = // (y'_4; y'_5) + // v19 = // (y'_6; y'_7) + shl v24.2d, v18.2d, #2 // (y'_4; y_5) a_i for t^2 + shl v25.2d, v19.2d, #2 // (y_6; y_7) a_i for t^2 + shl v26.2d, v18.2d, #5 // (y'_4; y_5) a_i for t^5 + shl v27.2d, v19.2d, #5 // (y_6; y_7) a_i for t^5 + shl v28.2d, v18.2d, #10 // (y'_4; y_5) a_i for t^10 + shl v29.2d, v19.2d, #10 // (y_6; y_7) a_i for t^10 + eor v18.16b, v18.16b, v24.16b // mix the contributions together + eor v19.16b, v19.16b, v25.16b + eor v26.16b, v26.16b, v28.16b + eor v27.16b, v27.16b, v29.16b + eor v18.16b, v18.16b, v26.16b + eor v19.16b, v19.16b, v27.16b + eor v0.16b, v16.16b, v18.16b + eor v1.16b, v17.16b, v19.16b +.endm + +///-------------------------------------------------------------------------- +/// Main code. + +// There are a number of representations of field elements in this code and +// it can be confusing. +// +// * The `external format' consists of a sequence of contiguous bytes in +// memory called a `block'. The GCM spec explains how to interpret this +// block as an element of a finite field. As discussed extensively, this +// representation is very annoying for a number of reasons. On the other +// hand, this code never actually deals with it directly. +// +// * The `register format' consists of one or more SIMD registers, +// depending on the block size. The bits in each byte are reversed, +// compared to the external format, which makes the polynomials +// completely vanilla, unlike all of the other GCM implementations. +// +// * The `table format' is just like the `register format', only the two +// halves of 128-bit SIMD register are the same, so we need twice as many +// registers. 
+// +// * The `words' format consists of a sequence of bytes, as in the +// `external format', but, according to the blockcipher in use, the bytes +// within each 32-bit word may be reversed (`big-endian') or not +// (`little-endian'). Accordingly, there are separate entry points for +// each variant, identified with `b' or `l'. + +FUNC(gcm_mulk_128b_arm64_pmull) + // On entry, x0 points to a 128-bit field element A in big-endian + // words format; x1 points to a field-element K in table format. On + // exit, A is updated with the product A K. + + ldr q0, [x0] + ldp q1, q2, [x1] + rev32 v0.16b, v0.16b + vzero + rbit v0.16b, v0.16b + mul128 + rbit v0.16b, v0.16b + rev32 v0.16b, v0.16b + str q0, [x0] + ret +ENDFUNC + +FUNC(gcm_mulk_128l_arm64_pmull) + // On entry, x0 points to a 128-bit field element A in little-endian + // words format; x1 points to a field-element K in table format. On + // exit, A is updated with the product A K. + + ldr q0, [x0] + ldp q1, q2, [x1] + vzero + rbit v0.16b, v0.16b + mul128 + rbit v0.16b, v0.16b + str q0, [x0] + ret +ENDFUNC + +FUNC(gcm_mulk_64b_arm64_pmull) + // On entry, x0 points to a 64-bit field element A in big-endian + // words format; x1 points to a field-element K in table format. On + // exit, A is updated with the product A K. + + ldr d0, [x0] + ldr q1, [x1] + rev32 v0.8b, v0.8b + rbit v0.8b, v0.8b + mul64 + rbit x2, x2 + ror x2, x2, #32 + str x2, [x0] + ret +ENDFUNC + +FUNC(gcm_mulk_64l_arm64_pmull) + // On entry, x0 points to a 64-bit field element A in little-endian + // words format; x1 points to a field-element K in table format. On + // exit, A is updated with the product A K. + + ldr d0, [x0] + ldr q1, [x1] + rbit v0.8b, v0.8b + mul64 + rbit x2, x2 + rev x2, x2 + str x2, [x0] + ret +ENDFUNC + +FUNC(gcm_mulk_96b_arm64_pmull) + // On entry, x0 points to a 96-bit field element A in big-endian + // words format; x1 points to a field-element K in table format. On + // exit, A is updated with the product A K. + + ldr w2, [x0, #8] + ldr d0, [x0, #0] + mov v0.d[1], x2 + ldp q1, q2, [x1] + rev32 v0.16b, v0.16b + vzero + rbit v0.16b, v0.16b + mul96 + rbit v0.16b, v0.16b + rev32 v0.16b, v0.16b + mov w2, v0.s[2] + str d0, [x0, #0] + str w2, [x0, #8] + ret +ENDFUNC + +FUNC(gcm_mulk_96l_arm64_pmull) + // On entry, x0 points to a 96-bit field element A in little-endian + // words format; x1 points to a field-element K in table format. On + // exit, A is updated with the product A K. + + ldr d0, [x0, #0] + ldr w2, [x0, #8] + mov v0.d[1], x2 + ldp q1, q2, [x1] + rbit v0.16b, v0.16b + vzero + mul96 + rbit v0.16b, v0.16b + mov w2, v0.s[2] + str d0, [x0, #0] + str w2, [x0, #8] + ret +ENDFUNC + +FUNC(gcm_mulk_192b_arm64_pmull) + // On entry, x0 points to a 192-bit field element A in big-endian + // words format; x1 points to a field-element K in table format. On + // exit, A is updated with the product A K. + + ldr q0, [x0, #0] + ldr d1, [x0, #16] + ldp q2, q3, [x1, #0] + ldr q4, [x1, #32] + rev32 v0.16b, v0.16b + rev32 v1.8b, v1.8b + rbit v0.16b, v0.16b + rbit v1.8b, v1.8b + vzero + mul192 + rev32 v0.16b, v0.16b + rev32 v1.8b, v1.8b + rbit v0.16b, v0.16b + rbit v1.8b, v1.8b + str q0, [x0, #0] + str d1, [x0, #16] + ret +ENDFUNC + +FUNC(gcm_mulk_192l_arm64_pmull) + // On entry, x0 points to a 192-bit field element A in little-endian + // words format; x1 points to a field-element K in table format. On + // exit, A is updated with the product A K. 
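+	// (The `l' entry points differ from the corresponding `b' ones only
+	// in the missing `rev32's: illustratively, a 32-bit word 0x01234567
+	// is stored as the bytes 67 45 23 01 in little-endian words format
+	// and as 01 23 45 67 in big-endian words format. The bit-reversal
+	// into register format is done by `rbit' in either case.)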
+ + ldr q0, [x0, #0] + ldr d1, [x0, #16] + ldp q2, q3, [x1, #0] + ldr q4, [x1, #32] + rbit v0.16b, v0.16b + rbit v1.8b, v1.8b + vzero + mul192 + rbit v0.16b, v0.16b + rbit v1.8b, v1.8b + str q0, [x0, #0] + str d1, [x0, #16] + ret +ENDFUNC + +FUNC(gcm_mulk_256b_arm64_pmull) + // On entry, x0 points to a 256-bit field element A in big-endian + // words format; x1 points to a field-element K in table format. On + // exit, A is updated with the product A K. + + ldp q0, q1, [x0] + ldp q2, q3, [x1, #0] + ldp q4, q5, [x1, #32] + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rbit v0.16b, v0.16b + rbit v1.16b, v1.16b + vzero + mul256 + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rbit v0.16b, v0.16b + rbit v1.16b, v1.16b + stp q0, q1, [x0] + ret +ENDFUNC + +FUNC(gcm_mulk_256l_arm64_pmull) + // On entry, x0 points to a 256-bit field element A in little-endian + // words format; x1 points to a field-element K in table format. On + // exit, A is updated with the product A K. + + ldp q0, q1, [x0] + ldp q2, q3, [x1, #0] + ldp q4, q5, [x1, #32] + rbit v0.16b, v0.16b + rbit v1.16b, v1.16b + vzero + mul256 + rbit v0.16b, v0.16b + rbit v1.16b, v1.16b + stp q0, q1, [x0] + ret +ENDFUNC + +///----- That's all, folks --------------------------------------------------