As some ARM processors, more specifically the Cortex-M series, are Thumb2-only, we need to support Thumb2-only builds even in assembly. (Imported from upstream's 11208dcfb9105e8afa37233185decefd45e89e17.) Change-Id: I7cb48ce6a842cf3cfdf553f6e6e6227d52d525c0 Reviewed-on: https://boringssl-review.googlesource.com/17108 Reviewed-by: Adam Langley <agl@google.com>
kris/onging/CECPQ3_patch15
@@ -70,15 +70,11 @@ $code=<<___; | |||
#endif | |||
.text | |||
#if __ARM_ARCH__<7 | |||
.code 32 | |||
#else | |||
#if defined(__thumb2__) && !defined(__APPLE__) | |||
.syntax unified | |||
# if defined(__thumb2__) && !defined(__APPLE__) | |||
.thumb | |||
# else | |||
#else | |||
.code 32 | |||
# endif | |||
#endif | |||
.type AES_Te,%object | |||
@@ -193,7 +189,7 @@ AES_Te: | |||
.type asm_AES_encrypt,%function | |||
.align 5 | |||
asm_AES_encrypt: | |||
#if __ARM_ARCH__<7 | |||
#ifndef __thumb2__ | |||
sub r3,pc,#8 @ asm_AES_encrypt | |||
#else | |||
adr r3,asm_AES_encrypt | |||
@@ -443,19 +439,19 @@ _armv4_AES_encrypt: | |||
.align 5 | |||
asm_AES_set_encrypt_key: | |||
_armv4_AES_set_encrypt_key: | |||
#if __ARM_ARCH__<7 | |||
#ifndef __thumb2__ | |||
sub r3,pc,#8 @ asm_AES_set_encrypt_key | |||
#else | |||
adr r3,asm_AES_set_encrypt_key | |||
#endif | |||
teq r0,#0 | |||
#if __ARM_ARCH__>=7 | |||
#ifdef __thumb2__ | |||
itt eq @ Thumb2 thing, sanity check in ARM | |||
#endif | |||
moveq r0,#-1 | |||
beq .Labrt | |||
teq r2,#0 | |||
#if __ARM_ARCH__>=7 | |||
#ifdef __thumb2__ | |||
itt eq @ Thumb2 thing, sanity check in ARM | |||
#endif | |||
moveq r0,#-1 | |||
@@ -466,7 +462,7 @@ _armv4_AES_set_encrypt_key: | |||
teq r1,#192 | |||
beq .Lok | |||
teq r1,#256 | |||
#if __ARM_ARCH__>=7 | |||
#ifdef __thumb2__ | |||
itt ne @ Thumb2 thing, sanity check in ARM | |||
#endif | |||
movne r0,#-1 | |||
@@ -627,7 +623,7 @@ _armv4_AES_set_encrypt_key: | |||
str $s2,[$key,#-16] | |||
subs $rounds,$rounds,#1 | |||
str $s3,[$key,#-12] | |||
#if __ARM_ARCH__>=7 | |||
#ifdef __thumb2__ | |||
itt eq @ Thumb2 thing, sanity check in ARM | |||
#endif | |||
subeq r2,$key,#216 | |||
@@ -699,7 +695,7 @@ _armv4_AES_set_encrypt_key: | |||
str $s2,[$key,#-24] | |||
subs $rounds,$rounds,#1 | |||
str $s3,[$key,#-20] | |||
#if __ARM_ARCH__>=7 | |||
#ifdef __thumb2__ | |||
itt eq @ Thumb2 thing, sanity check in ARM | |||
#endif | |||
subeq r2,$key,#256 | |||
@@ -969,7 +965,7 @@ AES_Td: | |||
.type asm_AES_decrypt,%function | |||
.align 5 | |||
asm_AES_decrypt: | |||
#if __ARM_ARCH__<7 | |||
#ifndef __thumb2__ | |||
sub r3,pc,#8 @ asm_AES_decrypt | |||
#else | |||
adr r3,asm_AES_decrypt | |||
@@ -82,7 +82,12 @@ $code=<<___; | |||
#include <openssl/arm_arch.h> | |||
.text | |||
#if defined(__thumb2__) && !defined(__APPLE__) | |||
.syntax unified | |||
.thumb | |||
#else | |||
.code 32 | |||
#endif | |||
#if __ARM_MAX_ARCH__>=7 | |||
.align 5 | |||
@@ -101,7 +106,7 @@ bn_mul_mont: | |||
#if __ARM_MAX_ARCH__>=7 | |||
tst ip,#7 | |||
bne .Lialu | |||
adr r0,bn_mul_mont | |||
adr r0,.Lbn_mul_mont | |||
ldr r2,.LOPENSSL_armcap | |||
ldr r0,[r0,r2] | |||
#ifdef __APPLE__ | |||
@@ -117,6 +122,9 @@ bn_mul_mont: | |||
#endif | |||
cmp ip,#2 | |||
mov $num,ip @ load num | |||
#ifdef __thumb2__ | |||
ittt lt | |||
#endif | |||
movlt r0,#0 | |||
addlt sp,sp,#2*4 | |||
blt .Labrt | |||
@@ -164,10 +172,11 @@ bn_mul_mont: | |||
ldr $n0,[$_n0] @ restore n0 | |||
adc $nhi,$nhi,#0 | |||
str $nlo,[$num] @ tp[num-1]= | |||
mov $tj,sp | |||
str $nhi,[$num,#4] @ tp[num]= | |||
.Louter: | |||
sub $tj,$num,sp @ "original" $num-1 value | |||
sub $tj,$num,$tj @ "original" $num-1 value | |||
sub $ap,$ap,$tj @ "rewind" ap to &ap[1] | |||
ldr $bi,[$tp,#4]! @ *(++bp) | |||
sub $np,$np,$tj @ "rewind" np to &np[1] | |||
@@ -212,11 +221,16 @@ bn_mul_mont: | |||
str $nhi,[$num,#4] @ tp[num]= | |||
cmp $tp,$tj | |||
#ifdef __thumb2__ | |||
itt ne | |||
#endif | |||
movne $tj,sp | |||
bne .Louter | |||
ldr $rp,[$_rp] @ pull rp | |||
mov $aj,sp | |||
add $num,$num,#4 @ $num to point at &tp[num] | |||
sub $aj,$num,sp @ "original" num value | |||
sub $aj,$num,$aj @ "original" num value | |||
mov $tp,sp @ "rewind" $tp | |||
mov $ap,$tp @ "borrow" $ap | |||
sub $np,$np,$aj @ "rewind" $np to &np[0] | |||
@@ -242,7 +256,8 @@ bn_mul_mont: | |||
cmp $tp,$num | |||
bne .Lcopy | |||
add sp,$num,#4 @ skip over tp[num+1] | |||
mov sp,$num | |||
add sp,sp,#4 @ skip over tp[num+1] | |||
ldmia sp!,{r4-r12,lr} @ restore registers | |||
add sp,sp,#2*4 @ skip over {r0,r2} | |||
mov r0,#1 | |||
@@ -283,6 +298,7 @@ bn_mul8x_mont_neon: | |||
stmdb sp!,{r4-r11} | |||
vstmdb sp!,{d8-d15} @ ABI specification says so | |||
ldmia ip,{r4-r5} @ load rest of parameter block | |||
mov ip,sp | |||
sub $toutptr,sp,#16 | |||
vld1.32 {${Bi}[0]}, [$bptr,:32]! | |||
@@ -638,8 +654,9 @@ bn_mul8x_mont_neon: | |||
bne .LNEON_sub | |||
ldr r10, [$aptr] @ load top-most bit | |||
mov r11,sp | |||
veor q0,q0,q0 | |||
sub r11,$bptr,sp @ this is num*4 | |||
sub r11,$bptr,r11 @ this is num*4 | |||
veor q1,q1,q1 | |||
mov $aptr,sp | |||
sub $rptr,$rptr,r11 @ rewind $rptr | |||
@@ -649,27 +666,33 @@ bn_mul8x_mont_neon: | |||
.LNEON_copy_n_zap: | |||
ldmia $aptr!, {r4-r7} | |||
ldmia $rptr, {r8-r11} | |||
it cc | |||
movcc r8, r4 | |||
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe | |||
itt cc | |||
movcc r9, r5 | |||
movcc r10,r6 | |||
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe | |||
it cc | |||
movcc r11,r7 | |||
ldmia $aptr, {r4-r7} | |||
stmia $rptr!, {r8-r11} | |||
sub $aptr,$aptr,#16 | |||
ldmia $rptr, {r8-r11} | |||
it cc | |||
movcc r8, r4 | |||
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe | |||
itt cc | |||
movcc r9, r5 | |||
movcc r10,r6 | |||
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe | |||
it cc | |||
movcc r11,r7 | |||
teq $aptr,$bptr @ preserves carry | |||
stmia $rptr!, {r8-r11} | |||
bne .LNEON_copy_n_zap | |||
sub sp,ip,#96 | |||
mov sp,ip | |||
vldmia sp!,{d8-d15} | |||
ldmia sp!,{r4-r11} | |||
ret @ bx lr | |||
@@ -136,7 +136,12 @@ $code=<<___; | |||
#include <openssl/arm_arch.h> | |||
.text | |||
#if defined(__thumb2__) && !defined(__APPLE__) | |||
.syntax unified | |||
.thumb | |||
#else | |||
.code 32 | |||
#endif | |||
#ifdef __clang__ | |||
#define ldrplb ldrbpl | |||
@@ -154,19 +159,27 @@ rem_4bit: | |||
.type rem_4bit_get,%function | |||
rem_4bit_get: | |||
sub $rem_4bit,pc,#8 | |||
sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit | |||
#if defined(__thumb2__) | |||
adr $rem_4bit,rem_4bit | |||
#else | |||
sub $rem_4bit,pc,#8+32 @ &rem_4bit | |||
#endif | |||
b .Lrem_4bit_got | |||
nop | |||
nop | |||
.size rem_4bit_get,.-rem_4bit_get | |||
.global gcm_ghash_4bit | |||
.type gcm_ghash_4bit,%function | |||
.align 4 | |||
gcm_ghash_4bit: | |||
sub r12,pc,#8 | |||
#if defined(__thumb2__) | |||
adr r12,rem_4bit | |||
#else | |||
sub r12,pc,#8+48 @ &rem_4bit | |||
#endif | |||
add $len,$inp,$len @ $len to point at the end | |||
stmdb sp!,{r3-r11,lr} @ save $len/end too | |||
sub r12,r12,#48 @ &rem_4bit | |||
ldmia r12,{r4-r11} @ copy rem_4bit ... | |||
stmdb sp!,{r4-r11} @ ... to stack | |||
@@ -213,6 +226,9 @@ gcm_ghash_4bit: | |||
eor $Zlh,$Zlh,$Zhl,lsl#28 | |||
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] | |||
eor $Zhl,$Thl,$Zhl,lsr#4 | |||
#ifdef __thumb2__ | |||
it pl | |||
#endif | |||
ldrplb $nlo,[$inp,$cnt] | |||
eor $Zhl,$Zhl,$Zhh,lsl#28 | |||
eor $Zhh,$Thh,$Zhh,lsr#4 | |||
@@ -223,6 +239,9 @@ gcm_ghash_4bit: | |||
add $nhi,$nhi,$nhi | |||
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | |||
eor $Zll,$Tll,$Zll,lsr#4 | |||
#ifdef __thumb2__ | |||
it pl | |||
#endif | |||
ldrplb $Tll,[$Xi,$cnt] | |||
eor $Zll,$Zll,$Zlh,lsl#28 | |||
eor $Zlh,$Tlh,$Zlh,lsr#4 | |||
@@ -230,8 +249,14 @@ gcm_ghash_4bit: | |||
eor $Zlh,$Zlh,$Zhl,lsl#28 | |||
eor $Zhl,$Thl,$Zhl,lsr#4 | |||
eor $Zhl,$Zhl,$Zhh,lsl#28 | |||
#ifdef __thumb2__ | |||
it pl | |||
#endif | |||
eorpl $nlo,$nlo,$Tll | |||
eor $Zhh,$Thh,$Zhh,lsr#4 | |||
#ifdef __thumb2__ | |||
itt pl | |||
#endif | |||
andpl $nhi,$nlo,#0xf0 | |||
andpl $nlo,$nlo,#0x0f | |||
eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] | |||
@@ -241,7 +266,11 @@ gcm_ghash_4bit: | |||
add $inp,$inp,#16 | |||
mov $nhi,$Zll | |||
___ | |||
&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]"); | |||
&Zsmash("cmp\t$inp,$len","\n". | |||
"#ifdef __thumb2__\n". | |||
" it ne\n". | |||
"#endif\n". | |||
" ldrneb $nlo,[$inp,#15]"); | |||
$code.=<<___; | |||
bne .Louter | |||
@@ -299,6 +328,9 @@ gcm_gmult_4bit: | |||
eor $Zlh,$Zlh,$Zhl,lsl#28 | |||
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] | |||
eor $Zhl,$Thl,$Zhl,lsr#4 | |||
#ifdef __thumb2__ | |||
it pl | |||
#endif | |||
ldrplb $nlo,[$Xi,$cnt] | |||
eor $Zhl,$Zhl,$Zhh,lsl#28 | |||
eor $Zhh,$Thh,$Zhh,lsr#4 | |||
@@ -316,6 +348,9 @@ gcm_gmult_4bit: | |||
eor $Zhl,$Thl,$Zhl,lsr#4 | |||
eor $Zhl,$Zhl,$Zhh,lsl#28 | |||
eor $Zhh,$Thh,$Zhh,lsr#4 | |||
#ifdef __thumb2__ | |||
itt pl | |||
#endif | |||
andpl $nhi,$nlo,#0xf0 | |||
andpl $nlo,$nlo,#0x0f | |||
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | |||
@@ -181,7 +181,12 @@ $code=<<___; | |||
#include <openssl/arm_arch.h> | |||
.text | |||
#if defined(__thumb2__) && !defined(__APPLE__) | |||
.syntax unified | |||
.thumb | |||
#else | |||
.code 32 | |||
#endif | |||
.global sha1_block_data_order | |||
.type sha1_block_data_order,%function | |||
@@ -189,7 +194,8 @@ $code=<<___; | |||
.align 5 | |||
sha1_block_data_order: | |||
#if __ARM_MAX_ARCH__>=7 | |||
sub r3,pc,#8 @ sha1_block_data_order | |||
.Lsha1_block: | |||
adr r3,.Lsha1_block | |||
ldr r12,.LOPENSSL_armcap | |||
ldr r12,[r3,r12] @ OPENSSL_armcap_P | |||
#ifdef __APPLE__ | |||
@@ -216,7 +222,12 @@ for($i=0;$i<5;$i++) { | |||
&BODY_00_15(@V); unshift(@V,pop(@V)); | |||
} | |||
$code.=<<___; | |||
#if defined(__thumb2__) && !defined(__APPLE__) | |||
mov $t3,sp | |||
teq $Xi,$t3 | |||
#else | |||
teq $Xi,sp | |||
#endif | |||
bne .L_00_15 @ [((11+4)*5+2)*3] | |||
sub sp,sp,#25*4 | |||
___ | |||
@@ -235,7 +246,12 @@ for($i=0;$i<5;$i++) { | |||
&BODY_20_39(@V); unshift(@V,pop(@V)); | |||
} | |||
$code.=<<___; | |||
#if defined(__thumb2__) && !defined(__APPLE__) | |||
mov $t3,sp | |||
teq $Xi,$t3 | |||
#else | |||
teq $Xi,sp @ preserve carry | |||
#endif | |||
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] | |||
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes | |||
@@ -247,7 +263,12 @@ for($i=0;$i<5;$i++) { | |||
&BODY_40_59(@V); unshift(@V,pop(@V)); | |||
} | |||
$code.=<<___; | |||
#if defined(__thumb2__) && !defined(__APPLE__) | |||
mov $t3,sp | |||
teq $Xi,$t3 | |||
#else | |||
teq $Xi,sp | |||
#endif | |||
bne .L_40_59 @ [+((12+5)*5+2)*4] | |||
ldr $K,.LK_60_79 | |||
@@ -283,7 +304,7 @@ $code.=<<___; | |||
.LK_60_79: .word 0xca62c1d6 | |||
#if __ARM_MAX_ARCH__>=7 | |||
.LOPENSSL_armcap: | |||
.word OPENSSL_armcap_P-sha1_block_data_order | |||
.word OPENSSL_armcap_P-.Lsha1_block | |||
#endif | |||
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" | |||
.align 5 | |||
@@ -458,6 +479,7 @@ sub Xuplast_80 () | |||
&teq ($inp,$len); | |||
&sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX | |||
&it ("eq"); | |||
&subeq ($inp,$inp,64); # reload last block to avoid SEGV | |||
&vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!"); | |||
eval(shift(@insns)); | |||
@@ -508,12 +530,12 @@ sha1_block_data_order_neon: | |||
@ dmb @ errata #451034 on early Cortex A8 | |||
@ vstmdb sp!,{d8-d15} @ ABI specification says so | |||
mov $saved_sp,sp | |||
sub sp,sp,#64 @ alloca | |||
sub $Xfer,sp,#64 | |||
adr $K_XX_XX,.LK_00_19 | |||
bic sp,sp,#15 @ align for 128-bit stores | |||
bic $Xfer,$Xfer,#15 @ align for 128-bit stores | |||
ldmia $ctx,{$a,$b,$c,$d,$e} @ load context | |||
mov $Xfer,sp | |||
mov sp,$Xfer @ alloca | |||
vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned | |||
veor $zero,$zero,$zero | |||
@@ -560,10 +582,13 @@ $code.=<<___; | |||
add $b,$b,$t0 | |||
add $c,$c,$t1 | |||
add $d,$d,$Xfer | |||
it eq | |||
moveq sp,$saved_sp | |||
add $e,$e,$Ki | |||
it ne | |||
ldrne $Ki,[sp] | |||
stmia $ctx,{$a,$b,$c,$d,$e} | |||
itt ne | |||
addne $Xfer,sp,#3*16 | |||
bne .Loop_neon | |||
@@ -584,6 +609,13 @@ my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14)); | |||
$code.=<<___; | |||
#if __ARM_MAX_ARCH__>=7 | |||
# if defined(__thumb2__) && !defined(__APPLE__) | |||
# define INST(a,b,c,d) .byte c,d|0xf,a,b | |||
# else | |||
# define INST(a,b,c,d) .byte a,b,c,d|0x10 | |||
# endif | |||
.type sha1_block_data_order_armv8,%function | |||
.align 5 | |||
sha1_block_data_order_armv8: | |||
@@ -678,7 +710,10 @@ ___ | |||
# since ARMv7 instructions are always encoded little-endian. | |||
# correct solution is to use .inst directive, but older | |||
# assemblers don't implement it:-( | |||
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", | |||
# this fix-up provides Thumb encoding in conjunction with INST | |||
$word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000); | |||
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", | |||
$word&0xff,($word>>8)&0xff, | |||
($word>>16)&0xff,($word>>24)&0xff, | |||
$mnemonic,$arg; | |||
@@ -212,16 +212,12 @@ $code=<<___; | |||
#endif | |||
.text | |||
#if __ARM_ARCH__<7 || defined(__APPLE__) | |||
.code 32 | |||
#else | |||
#if defined(__thumb2__) && !defined(__APPLE__) | |||
.syntax unified | |||
# ifdef __thumb2__ | |||
# define adrl adr | |||
.thumb | |||
# else | |||
.code 32 | |||
# endif | |||
# define adrl adr | |||
#else | |||
.code 32 | |||
#endif | |||
.type K512,%object | |||
@@ -280,10 +276,10 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) | |||
.type sha512_block_data_order,%function | |||
sha512_block_data_order: | |||
.Lsha512_block_data_order: | |||
#if __ARM_ARCH__<7 | |||
#if __ARM_ARCH__<7 && !defined(__thumb2__) | |||
sub r3,pc,#8 @ sha512_block_data_order | |||
#else | |||
adr r3,sha512_block_data_order | |||
adr r3,.Lsha512_block_data_order | |||
#endif | |||
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) | |||
ldr r12,.LOPENSSL_armcap | |||