From ae96383af375d52f30f72554b75272fa226ca795 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Mon, 12 Jun 2017 18:31:15 -0400 Subject: [PATCH] ARMv4 assembly pack: implement support for Thumb2. As some of ARM processors, more specifically Cortex-Mx series, are Thumb2-only, we need to support Thumb2-only builds even in assembly. (Imported from upstream's 11208dcfb9105e8afa37233185decefd45e89e17.) Change-Id: I7cb48ce6a842cf3cfdf553f6e6e6227d52d525c0 Reviewed-on: https://boringssl-review.googlesource.com/17108 Reviewed-by: Adam Langley --- crypto/fipsmodule/aes/asm/aes-armv4.pl | 24 ++++------ crypto/fipsmodule/bn/asm/armv4-mont.pl | 35 +++++++++++--- crypto/fipsmodule/modes/asm/ghash-armv4.pl | 45 ++++++++++++++++-- crypto/fipsmodule/sha/asm/sha1-armv4-large.pl | 47 ++++++++++++++++--- crypto/fipsmodule/sha/asm/sha512-armv4.pl | 16 +++---- 5 files changed, 126 insertions(+), 41 deletions(-) diff --git a/crypto/fipsmodule/aes/asm/aes-armv4.pl b/crypto/fipsmodule/aes/asm/aes-armv4.pl index e8d7f2bf..c1cf4b77 100644 --- a/crypto/fipsmodule/aes/asm/aes-armv4.pl +++ b/crypto/fipsmodule/aes/asm/aes-armv4.pl @@ -70,15 +70,11 @@ $code=<<___; #endif .text -#if __ARM_ARCH__<7 -.code 32 -#else +#if defined(__thumb2__) && !defined(__APPLE__) .syntax unified -# if defined(__thumb2__) && !defined(__APPLE__) .thumb -# else +#else .code 32 -# endif #endif .type AES_Te,%object @@ -193,7 +189,7 @@ AES_Te: .type asm_AES_encrypt,%function .align 5 asm_AES_encrypt: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ asm_AES_encrypt #else adr r3,asm_AES_encrypt @@ -443,19 +439,19 @@ _armv4_AES_encrypt: .align 5 asm_AES_set_encrypt_key: _armv4_AES_set_encrypt_key: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ asm_AES_set_encrypt_key #else adr r3,asm_AES_set_encrypt_key #endif teq r0,#0 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif moveq r0,#-1 beq .Labrt teq r2,#0 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif moveq r0,#-1 @@ -466,7 +462,7 @@ _armv4_AES_set_encrypt_key: teq r1,#192 beq .Lok teq r1,#256 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt ne @ Thumb2 thing, sanity check in ARM #endif movne r0,#-1 @@ -627,7 +623,7 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-16] subs $rounds,$rounds,#1 str $s3,[$key,#-12] -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif subeq r2,$key,#216 @@ -699,7 +695,7 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-24] subs $rounds,$rounds,#1 str $s3,[$key,#-20] -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif subeq r2,$key,#256 @@ -969,7 +965,7 @@ AES_Td: .type asm_AES_decrypt,%function .align 5 asm_AES_decrypt: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ asm_AES_decrypt #else adr r3,asm_AES_decrypt diff --git a/crypto/fipsmodule/bn/asm/armv4-mont.pl b/crypto/fipsmodule/bn/asm/armv4-mont.pl index acda9e52..ad4c12b9 100644 --- a/crypto/fipsmodule/bn/asm/armv4-mont.pl +++ b/crypto/fipsmodule/bn/asm/armv4-mont.pl @@ -82,7 +82,12 @@ $code=<<___; #include .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif #if __ARM_MAX_ARCH__>=7 .align 5 @@ -101,7 +106,7 @@ bn_mul_mont: #if __ARM_MAX_ARCH__>=7 tst ip,#7 bne .Lialu - adr r0,bn_mul_mont + adr r0,.Lbn_mul_mont ldr r2,.LOPENSSL_armcap ldr r0,[r0,r2] #ifdef __APPLE__ @@ -117,6 +122,9 @@ bn_mul_mont: #endif cmp ip,#2 mov $num,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif movlt r0,#0 addlt 
sp,sp,#2*4 blt .Labrt @@ -164,10 +172,11 @@ bn_mul_mont: ldr $n0,[$_n0] @ restore n0 adc $nhi,$nhi,#0 str $nlo,[$num] @ tp[num-1]= + mov $tj,sp str $nhi,[$num,#4] @ tp[num]= .Louter: - sub $tj,$num,sp @ "original" $num-1 value + sub $tj,$num,$tj @ "original" $num-1 value sub $ap,$ap,$tj @ "rewind" ap to &ap[1] ldr $bi,[$tp,#4]! @ *(++bp) sub $np,$np,$tj @ "rewind" np to &np[1] @@ -212,11 +221,16 @@ bn_mul_mont: str $nhi,[$num,#4] @ tp[num]= cmp $tp,$tj +#ifdef __thumb2__ + itt ne +#endif + movne $tj,sp bne .Louter ldr $rp,[$_rp] @ pull rp + mov $aj,sp add $num,$num,#4 @ $num to point at &tp[num] - sub $aj,$num,sp @ "original" num value + sub $aj,$num,$aj @ "original" num value mov $tp,sp @ "rewind" $tp mov $ap,$tp @ "borrow" $ap sub $np,$np,$aj @ "rewind" $np to &np[0] @@ -242,7 +256,8 @@ bn_mul_mont: cmp $tp,$num bne .Lcopy - add sp,$num,#4 @ skip over tp[num+1] + mov sp,$num + add sp,sp,#4 @ skip over tp[num+1] ldmia sp!,{r4-r12,lr} @ restore registers add sp,sp,#2*4 @ skip over {r0,r2} mov r0,#1 @@ -283,6 +298,7 @@ bn_mul8x_mont_neon: stmdb sp!,{r4-r11} vstmdb sp!,{d8-d15} @ ABI specification says so ldmia ip,{r4-r5} @ load rest of parameter block + mov ip,sp sub $toutptr,sp,#16 vld1.32 {${Bi}[0]}, [$bptr,:32]! @@ -638,8 +654,9 @@ bn_mul8x_mont_neon: bne .LNEON_sub ldr r10, [$aptr] @ load top-most bit + mov r11,sp veor q0,q0,q0 - sub r11,$bptr,sp @ this is num*4 + sub r11,$bptr,r11 @ this is num*4 veor q1,q1,q1 mov $aptr,sp sub $rptr,$rptr,r11 @ rewind $rptr @@ -649,27 +666,33 @@ bn_mul8x_mont_neon: .LNEON_copy_n_zap: ldmia $aptr!, {r4-r7} ldmia $rptr, {r8-r11} + it cc movcc r8, r4 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + it cc movcc r11,r7 ldmia $aptr, {r4-r7} stmia $rptr!, {r8-r11} sub $aptr,$aptr,#16 ldmia $rptr, {r8-r11} + it cc movcc r8, r4 vst1.64 {q0-q1}, [$aptr,:256]! @ wipe + itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + it cc movcc r11,r7 teq $aptr,$bptr @ preserves carry stmia $rptr!, {r8-r11} bne .LNEON_copy_n_zap - sub sp,ip,#96 + mov sp,ip vldmia sp!,{d8-d15} ldmia sp!,{r4-r11} ret @ bx lr diff --git a/crypto/fipsmodule/modes/asm/ghash-armv4.pl b/crypto/fipsmodule/modes/asm/ghash-armv4.pl index 868a020a..cf4c4b28 100644 --- a/crypto/fipsmodule/modes/asm/ghash-armv4.pl +++ b/crypto/fipsmodule/modes/asm/ghash-armv4.pl @@ -136,7 +136,12 @@ $code=<<___; #include .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif #ifdef __clang__ #define ldrplb ldrbpl @@ -154,19 +159,27 @@ rem_4bit: .type rem_4bit_get,%function rem_4bit_get: - sub $rem_4bit,pc,#8 - sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit +#if defined(__thumb2__) + adr $rem_4bit,rem_4bit +#else + sub $rem_4bit,pc,#8+32 @ &rem_4bit +#endif b .Lrem_4bit_got nop + nop .size rem_4bit_get,.-rem_4bit_get .global gcm_ghash_4bit .type gcm_ghash_4bit,%function +.align 4 gcm_ghash_4bit: - sub r12,pc,#8 +#if defined(__thumb2__) + adr r12,rem_4bit +#else + sub r12,pc,#8+48 @ &rem_4bit +#endif add $len,$inp,$len @ $len to point at the end stmdb sp!,{r3-r11,lr} @ save $len/end too - sub r12,r12,#48 @ &rem_4bit ldmia r12,{r4-r11} @ copy rem_4bit ... stmdb sp!,{r4-r11} @ ... 
to stack @@ -213,6 +226,9 @@ gcm_ghash_4bit: eor $Zlh,$Zlh,$Zhl,lsl#28 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] eor $Zhl,$Thl,$Zhl,lsr#4 +#ifdef __thumb2__ + it pl +#endif ldrplb $nlo,[$inp,$cnt] eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhh,$Thh,$Zhh,lsr#4 @@ -223,6 +239,9 @@ gcm_ghash_4bit: add $nhi,$nhi,$nhi ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] eor $Zll,$Tll,$Zll,lsr#4 +#ifdef __thumb2__ + it pl +#endif ldrplb $Tll,[$Xi,$cnt] eor $Zll,$Zll,$Zlh,lsl#28 eor $Zlh,$Tlh,$Zlh,lsr#4 @@ -230,8 +249,14 @@ gcm_ghash_4bit: eor $Zlh,$Zlh,$Zhl,lsl#28 eor $Zhl,$Thl,$Zhl,lsr#4 eor $Zhl,$Zhl,$Zhh,lsl#28 +#ifdef __thumb2__ + it pl +#endif eorpl $nlo,$nlo,$Tll eor $Zhh,$Thh,$Zhh,lsr#4 +#ifdef __thumb2__ + itt pl +#endif andpl $nhi,$nlo,#0xf0 andpl $nlo,$nlo,#0x0f eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] @@ -241,7 +266,11 @@ gcm_ghash_4bit: add $inp,$inp,#16 mov $nhi,$Zll ___ - &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]"); + &Zsmash("cmp\t$inp,$len","\n". + "#ifdef __thumb2__\n". + " it ne\n". + "#endif\n". + " ldrneb $nlo,[$inp,#15]"); $code.=<<___; bne .Louter @@ -299,6 +328,9 @@ gcm_gmult_4bit: eor $Zlh,$Zlh,$Zhl,lsl#28 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] eor $Zhl,$Thl,$Zhl,lsr#4 +#ifdef __thumb2__ + it pl +#endif ldrplb $nlo,[$Xi,$cnt] eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhh,$Thh,$Zhh,lsr#4 @@ -316,6 +348,9 @@ gcm_gmult_4bit: eor $Zhl,$Thl,$Zhl,lsr#4 eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhh,$Thh,$Zhh,lsr#4 +#ifdef __thumb2__ + itt pl +#endif andpl $nhi,$nlo,#0xf0 andpl $nlo,$nlo,#0x0f eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] diff --git a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl index 944248fe..93265cdb 100644 --- a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl +++ b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl @@ -181,7 +181,12 @@ $code=<<___; #include .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif .global sha1_block_data_order .type sha1_block_data_order,%function @@ -189,7 +194,8 @@ $code=<<___; .align 5 sha1_block_data_order: #if __ARM_MAX_ARCH__>=7 - sub r3,pc,#8 @ sha1_block_data_order +.Lsha1_block: + adr r3,.Lsha1_block ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P #ifdef __APPLE__ @@ -216,7 +222,12 @@ for($i=0;$i<5;$i++) { &BODY_00_15(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) && !defined(__APPLE__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp +#endif bne .L_00_15 @ [((11+4)*5+2)*3] sub sp,sp,#25*4 ___ @@ -235,7 +246,12 @@ for($i=0;$i<5;$i++) { &BODY_20_39(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) && !defined(__APPLE__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp @ preserve carry +#endif bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes @@ -247,7 +263,12 @@ for($i=0;$i<5;$i++) { &BODY_40_59(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) && !defined(__APPLE__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp +#endif bne .L_40_59 @ [+((12+5)*5+2)*4] ldr $K,.LK_60_79 @@ -283,7 +304,7 @@ $code.=<<___; .LK_60_79: .word 0xca62c1d6 #if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha1_block_data_order +.word OPENSSL_armcap_P-.Lsha1_block #endif .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " .align 5 @@ -458,6 +479,7 @@ sub Xuplast_80 () &teq ($inp,$len); &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX + &it ("eq"); &subeq ($inp,$inp,64); # reload last block to avoid SEGV &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!"); eval(shift(@insns)); @@ 
-508,12 +530,12 @@ sha1_block_data_order_neon: @ dmb @ errata #451034 on early Cortex A8 @ vstmdb sp!,{d8-d15} @ ABI specification says so mov $saved_sp,sp - sub sp,sp,#64 @ alloca + sub $Xfer,sp,#64 adr $K_XX_XX,.LK_00_19 - bic sp,sp,#15 @ align for 128-bit stores + bic $Xfer,$Xfer,#15 @ align for 128-bit stores ldmia $ctx,{$a,$b,$c,$d,$e} @ load context - mov $Xfer,sp + mov sp,$Xfer @ alloca vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned veor $zero,$zero,$zero @@ -560,10 +582,13 @@ $code.=<<___; add $b,$b,$t0 add $c,$c,$t1 add $d,$d,$Xfer + it eq moveq sp,$saved_sp add $e,$e,$Ki + it ne ldrne $Ki,[sp] stmia $ctx,{$a,$b,$c,$d,$e} + itt ne addne $Xfer,sp,#3*16 bne .Loop_neon @@ -584,6 +609,13 @@ my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14)); $code.=<<___; #if __ARM_MAX_ARCH__>=7 + +# if defined(__thumb2__) && !defined(__APPLE__) +# define INST(a,b,c,d) .byte c,d|0xf,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d|0x10 +# endif + .type sha1_block_data_order_armv8,%function .align 5 sha1_block_data_order_armv8: @@ -678,7 +710,10 @@ ___ # since ARMv7 instructions are always encoded little-endian. # correct solution is to use .inst directive, but older # assemblers don't implement it:-( - sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + + # this fix-up provides Thumb encoding in conjunction with INST + $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000); + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl index fd92f7a1..33c4e8cb 100644 --- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl +++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl @@ -212,16 +212,12 @@ $code=<<___; #endif .text -#if __ARM_ARCH__<7 || defined(__APPLE__) -.code 32 -#else +#if defined(__thumb2__) && !defined(__APPLE__) .syntax unified -# ifdef __thumb2__ -# define adrl adr .thumb -# else -.code 32 -# endif +# define adrl adr +#else +.code 32 #endif .type K512,%object @@ -280,10 +276,10 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .type sha512_block_data_order,%function sha512_block_data_order: .Lsha512_block_data_order: -#if __ARM_ARCH__<7 +#if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha512_block_data_order #else - adr r3,sha512_block_data_order + adr r3,.Lsha512_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap
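
A few notes on the recurring patterns in this conversion. The bulk of the churn is the it/itt/ittt insertions: in ARM state almost any instruction can carry its own condition code, but in Thumb-2 a conditionally executed instruction has to be introduced by an IT instruction that names the condition and covers one to four following instructions, so every moveq/addlt/ldrplb-style sequence above gains a guarded IT prefix. A minimal sketch of the pattern (the check_arg helper is hypothetical, not part of the patch):

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	check_arg,%function
check_arg:
	teq	r0,#0			@ Z=1 when the argument is NULL
#ifdef	__thumb2__
	itt	eq			@ next two instructions execute only if EQ
#endif
	moveq	r0,#-1			@ conditional on its own in ARM; needs the IT block in Thumb-2
	beq	.Lout			@ a branch may occupy the last IT slot
	mov	r0,#0
.Lout:
	bx	lr
.size	check_arg,.-check_arg

As in the patch, the IT stays under #ifdef __thumb2__, so ARM-mode builds with older assemblers never see it.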
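Second, the addressing changes. The old "sub rX,pc,#8" idiom only yields the current instruction's address in ARM state, where reading PC gives the current instruction plus 8; in Thumb state the offset is 4 and PC is far more restricted as an operand, so the patch switches to adr (in ghash-armv4.pl the extra displacement is folded into a single "sub rX,pc,#8+32"-style constant for the ARM path, with nop/.align padding keeping the distance fixed). The adr targets also move to local labels such as .Lsha1_block and .Lbn_mul_mont, with the matching ".word OPENSSL_armcap_P-.Lsha1_block", presumably because taking the address of a function symbol from Thumb code can come back with the Thumb bit set and upset the table arithmetic that follows. A sketch under those assumptions (get_own_address is an invented name):

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	get_own_address,%function
get_own_address:
.Lget_own_address:			@ local label: no function/Thumb-bit semantics attached
#ifndef	__thumb2__
	sub	r0,pc,#8		@ ARM only: PC reads as "this instruction + 8"
#else
	adr	r0,.Lget_own_address	@ assembler-generated PC-relative add/sub,
					@ valid in both ARM and Thumb-2
#endif
	bx	lr
.size	get_own_address,.-get_own_address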
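Third, the SP rewrites in armv4-mont.pl and sha1-armv4-large.pl. Thumb-2 only accepts SP in a handful of data-processing forms (roughly MOV to or from SP, and ADD/SUB with SP as both destination and first operand), so expressions such as "sub $tj,$num,sp", "add sp,$num,#4" and "bic sp,sp,#15" are not encodable there. The patch therefore snapshots SP into an ordinary register first and does the arithmetic on the copy, using the same portable sequence in both states rather than conditionalising it. A condensed sketch of the patterns (the stack_demo wrapper is invented; the individual instruction pairs mirror the hunks above):

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	stack_demo,%function
stack_demo:
	mov	ip,sp			@ keep the caller's SP in a plain register
	sub	r1,sp,#64		@ carve out 64 bytes, computed into r1 because
	bic	r1,r1,#15		@ "bic sp,sp,#15" is not allowed in Thumb-2
	mov	sp,r1			@ MOV to SP works in both states

	@ ... the aligned scratch area at [sp] would be used here ...

	mov	r2,sp			@ "sub r2,ip,sp" (SP as second operand) is rejected
	sub	r2,ip,r2		@ for Thumb-2, so copy SP out before subtracting
	mov	sp,ip			@ restore SP, as in the NEON epilogue's "mov sp,ip"
	bx	lr
.size	stack_demo,.-stack_demo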
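Finally, the INST() macro in sha1-armv4-large.pl. The ARMv8 SHA-1 instructions are emitted as raw bytes so that assemblers without the crypto extension still build the file, and the raw encodings differ between states: ARM stores the instruction as one little-endian 32-bit word, while Thumb-2 stores two little-endian halfwords, most-significant half first, and encodes the Advanced SIMD leading byte as 0xEF/0xFF where ARM uses 0xF2/0xF3. As far as I can tell that is exactly what the macro and the Perl fix-up implement between them: the Perl side clears bit 28 of words whose bits 27-24 are 0b0010, and the macro ORs the right bits back in for whichever state is being assembled (d|0x10 recreates the 0xF2/0xF3 ARM byte, d|0xf produces the 0xEF/0xFF Thumb-2 byte). An annotated copy of the macro, with a,b,c,d being the bytes of the fixed-up ARM word, least significant first:

# if defined(__thumb2__) && !defined(__APPLE__)
#  define INST(a,b,c,d)	.byte	c,d|0xf,a,b	@ two little-endian halfwords, high half first
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d|0x10	@ one little-endian ARM word, bit 28 restored
# endif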