- #include "asm-common.h"
-
- .arch armv8-a+crypto
-
- .extern F(abort)
- .extern F(rijndael_rcon)
-
- .text
-
- ///--------------------------------------------------------------------------
- /// Main code.
-
/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but the internal interface presents as big-endian so as
/// to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
- ///
- /// For added amusement, the crypto extension doesn't implement the larger-
- /// block versions of Rijndael, so we have to end-swap the keys if we're
- /// preparing for one of those.
-
- // Useful constants.
- .equ maxrounds, 16 // maximum number of rounds
- .equ maxblksz, 32 // maximum block size, in bytes
- .equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
-
- // Context structure.
- .equ nr, 0 // number of rounds
- .equ w, nr + 4 // encryption key words
- .equ wi, w + kbufsz // decryption key words
-
- ///--------------------------------------------------------------------------
- /// Key setup.
-
- FUNC(rijndael_setup_arm64_crypto)
-
- // Arguments:
- // x0 = pointer to context
- // w1 = block size in 32-bit words
- // x2 = pointer to key material
- // x3 = key size in words
-
- pushreg x29, x30
- mov x29, sp
-
- // The initial round key material is taken directly from the input
- // key, so copy it over. Unfortunately, the key material is not
- // guaranteed to be aligned in any especially useful way. Assume
- // that alignment traps are not enabled. (Why would they be? On
- // A32, alignment traps were part of a transition plan which changed
- // the way unaligned loads and stores behaved, but there's never been
- // any other behaviour on A64.)
- mov x15, x3
- add x4, x0, #w
- 0: sub x15, x15, #1
- ldr w14, [x2], #4
- str w14, [x4], #4
- cbnz x15, 0b
-
- // Find out other useful things and prepare for the main loop.
- 9: ldr w9, [x0, #nr] // number of rounds
- madd w2, w1, w9, w1 // total key size in words
- leaext x5, rijndael_rcon // round constants
- sub x6, x2, x3 // minus what we've copied already
- add x7, x0, #w // position in previous cycle
- movi v1.4s, #0 // all-zero register for the key
- mov x8, #0 // position in current cycle
-
- // Main key expansion loop. Dispatch according to the position in
- // the cycle.
- 0: ldr w15, [x7], #4 // word from previous cycle
- cbz x8, 1f // first word of the cycle?
- cmp x8, #4 // fourth word of the cycle?
- b.ne 2f
- cmp x3, #7 // seven or eight words of key?
- b.cc 2f
-
- // Fourth word of the cycle, seven or eight words of key. We must do
- // the byte substitution.
- dup v0.4s, w14
- aese v0.16b, v1.16b // effectively, just SubBytes
- mov w14, v0.s[0]
- b 2f
-
- // First word of the cycle. Byte substitution, rotation, and round
- // constant.
- 1: ldrb w13, [x5], #1 // next round constant
- dup v0.4s, w14
- aese v0.16b, v1.16b // effectively, just SubBytes
- mov w14, v0.s[0]
- eor w14, w13, w14, ror #8
-
- // Common ending: mix in the word from the previous cycle and store.
- 2: eor w14, w14, w15
- str w14, [x4], #4
-
- // Prepare for the next iteration. If we're done, then stop; if
- // we've finished a cycle then reset the counter.
- add x8, x8, #1
- sub x6, x6, #1
- cmp x8, x3
- cbz x6, 9f
- cmov.cs x8, xzr
- b 0b
-
- // Next job is to construct the decryption keys. The keys for the
- // first and last rounds don't need to be mangled, but the remaining
- // ones do -- and they all need to be reordered too.
- //
- // The plan of action, then, is to copy the final encryption round's
- // keys into place first, then to do each of the intermediate rounds
- // in reverse order, and finally do the first round.
- //
- // Do all the heavy lifting with the vector registers. The order
- // we're doing this in means that it's OK if we read or write too
- // much, and there's easily enough buffer space for the
- // over-enthusiastic reads and writes because the context has space
- // for 32-byte blocks, which is our maximum and an exact fit for two
- // full-width registers.
- 9: add x5, x0, #wi
- add x4, x0, #w
- add x4, x4, w2, uxtw #2
- sub x4, x4, w1, uxtw #2 // last round's keys
-
- // Copy the last encryption round's keys.
- ld1 {v0.4s, v1.4s}, [x4]
- st1 {v0.4s, v1.4s}, [x5]
-
- // Update the loop variables and stop if we've finished.
- 0: sub w9, w9, #1
- add x5, x5, w1, uxtw #2
- sub x4, x4, w1, uxtw #2
- cbz w9, 9f
-
- // Do another middle round's keys...
- ld1 {v0.4s, v1.4s}, [x4]
- aesimc v0.16b, v0.16b
- aesimc v1.16b, v1.16b
- st1 {v0.4s, v1.4s}, [x5]
- b 0b
-
- // Finally do the first encryption round.
- 9: ld1 {v0.4s, v1.4s}, [x4]
- st1 {v0.4s, v1.4s}, [x5]
-
- // If the block size is not exactly four words then we must end-swap
- // everything. We can use fancy vector toys for this.
- cmp w1, #4
- b.eq 9f
-
- // End-swap the encryption keys.
- add x1, x0, #w
- bl endswap_block
-
- // And the decryption keys
- add x1, x0, #wi
- bl endswap_block
-
- // All done.
- 9: popreg x29, x30
- ret
-
- ENDFUNC
-
- INTFUNC(endswap_block)
- // End-swap w2 words starting at x1. x1 is clobbered; w2 is not.
- // It's OK to work in 16-byte chunks.
-
- mov w3, w2
- 0: subs w3, w3, #4
- ld1 {v0.4s}, [x1]
- rev32 v0.16b, v0.16b
- st1 {v0.4s}, [x1], #16
- b.hi 0b
- ret
-
- ENDFUNC
-
- ///--------------------------------------------------------------------------
- /// Encrypting and decrypting blocks.
-
- .macro encdec op, aes, mc, koff
- FUNC(rijndael_\op\()_arm64_crypto)
-
- // Arguments:
- // x0 = pointer to context
- // x1 = pointer to input block
- // x2 = pointer to output block
-
- // Set things up ready.
- ldr w3, [x0, #nr]
- add x0, x0, #\koff
- ld1 {v0.4s}, [x1]
- rev32 v0.16b, v0.16b
-
- // Check the number of rounds and dispatch.
- cmp w3, #14
- b.eq 14f
- cmp w3, #10
- b.eq 10f
- cmp w3, #12
- b.eq 12f
- cmp w3, #13
- b.eq 13f
- cmp w3, #11
- b.eq 11f
- callext F(abort)
-
- // Eleven rounds.
- 11: ld1 {v16.4s}, [x0], #16
- \aes v0.16b, v16.16b
- \mc v0.16b, v0.16b
- b 10f
-
- // Twelve rounds.
- 12: ld1 {v16.4s, v17.4s}, [x0], #32
- \aes v0.16b, v16.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v17.16b
- \mc v0.16b, v0.16b
- b 10f
-
- // Thirteen rounds.
- 13: ld1 {v16.4s-v18.4s}, [x0], #48
- \aes v0.16b, v16.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v17.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v18.16b
- \mc v0.16b, v0.16b
- b 10f
-
- // Fourteen rounds. (Drops through to the ten round case because
- // this is the next most common.)
- 14: ld1 {v16.4s-v19.4s}, [x0], #64
- \aes v0.16b, v16.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v17.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v18.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v19.16b
- \mc v0.16b, v0.16b
- // Drop through...
-
- // Ten rounds.
- 10: ld1 {v16.4s-v19.4s}, [x0], #64
- ld1 {v20.4s-v23.4s}, [x0], #64
- \aes v0.16b, v16.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v17.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v18.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v19.16b
- \mc v0.16b, v0.16b
-
- ld1 {v16.4s-v18.4s}, [x0], #48
- \aes v0.16b, v20.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v21.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v22.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v23.16b
- \mc v0.16b, v0.16b
-
- // Final round has no MixColumns, but is followed by final whitening.
- \aes v0.16b, v16.16b
- \mc v0.16b, v0.16b
- \aes v0.16b, v17.16b
- eor v0.16b, v0.16b, v18.16b
-
- // All done.
- rev32 v0.16b, v0.16b
- st1 {v0.4s}, [x2]
- ret
-
- ENDFUNC
- .endm
-
- encdec eblk, aese, aesmc, w
- encdec dblk, aesd, aesimc, wi
-
- ///----- That's all, folks --------------------------------------------------
|