ARMv4 assembly pack: implement support for Thumb2.

Because some ARM processors, specifically the Cortex-M series, are Thumb2-only, we need to support Thumb2-only builds even in assembly. (Imported from upstream's 11208dcfb9105e8afa37233185decefd45e89e17.) Change-Id: I7cb48ce6a842cf3cfdf553f6e6e6227d52d525c0 Reviewed-on: https://boringssl-review.googlesource.com/17108 Reviewed-by: Adam Langley <agl@google.com>
7 years ago · ae96383af3
--- a/crypto/fipsmodule/aes/asm/aes-armv4.pl
+++ b/crypto/fipsmodule/aes/asm/aes-armv4.pl
@@ -70,15 +70,11 @@ $code=<<___;
 #endif
 .text
 #if __ARM_ARCH__<7
 .code	32
 #else
 #if defined(__thumb2__) && !defined(__APPLE__)
 .syntax	unified
 # if defined(__thumb2__) && !defined(__APPLE__)
 .thumb
 # else
 #else
 .code	32
 # endif
 #endif
 .type	AES_Te,%object
@@ -193,7 +189,7 @@ AES_Te:
 .type   asm_AES_encrypt,%function
 .align	5
 asm_AES_encrypt:
 #if __ARM_ARCH__<7
 #ifndef	__thumb2__
 	sub	r3,pc,#8		@ asm_AES_encrypt
 #else
 	adr	r3,asm_AES_encrypt
@@ -443,19 +439,19 @@ _armv4_AES_encrypt:
 .align	5
 asm_AES_set_encrypt_key:
 _armv4_AES_set_encrypt_key:
 #if __ARM_ARCH__<7
 #ifndef	__thumb2__
 	sub	r3,pc,#8		@ asm_AES_set_encrypt_key
 #else
 	adr	r3,asm_AES_set_encrypt_key
 #endif
 	teq	r0,#0
 #if __ARM_ARCH__>=7
 #ifdef	__thumb2__
 	itt	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	moveq	r0,#-1
 	beq	.Labrt
 	teq	r2,#0
 #if __ARM_ARCH__>=7
 #ifdef	__thumb2__
 	itt	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	moveq	r0,#-1
@@ -466,7 +462,7 @@ _armv4_AES_set_encrypt_key:
 	teq	r1,#192
 	beq	.Lok
 	teq	r1,#256
 #if __ARM_ARCH__>=7
 #ifdef	__thumb2__
 	itt	ne			@ Thumb2 thing, sanity check in ARM
 #endif
 	movne	r0,#-1
@@ -627,7 +623,7 @@ _armv4_AES_set_encrypt_key:
 	str	$s2,[$key,#-16]
 	subs	$rounds,$rounds,#1
 	str	$s3,[$key,#-12]
 #if __ARM_ARCH__>=7
 #ifdef	__thumb2__
 	itt	eq				@ Thumb2 thing, sanity check in ARM
 #endif
 	subeq	r2,$key,#216
@@ -699,7 +695,7 @@ _armv4_AES_set_encrypt_key:
 	str	$s2,[$key,#-24]
 	subs	$rounds,$rounds,#1
 	str	$s3,[$key,#-20]
 #if __ARM_ARCH__>=7
 #ifdef	__thumb2__
 	itt	eq				@ Thumb2 thing, sanity check in ARM
 #endif
 	subeq	r2,$key,#256
@@ -969,7 +965,7 @@ AES_Td:
 .type   asm_AES_decrypt,%function
 .align	5
 asm_AES_decrypt:
 #if __ARM_ARCH__<7
 #ifndef	__thumb2__
 	sub	r3,pc,#8		@ asm_AES_decrypt
 #else
 	adr	r3,asm_AES_decrypt
--- a/crypto/fipsmodule/bn/asm/armv4-mont.pl
+++ b/crypto/fipsmodule/bn/asm/armv4-mont.pl
@@ -82,7 +82,12 @@ $code=<<___;
 #include <openssl/arm_arch.h>
 .text
 #if defined(__thumb2__) && !defined(__APPLE__)
 .syntax	unified
 .thumb
 #else
 .code	32
 #endif
 #if __ARM_MAX_ARCH__>=7
 .align	5
@@ -101,7 +106,7 @@ bn_mul_mont:
 #if __ARM_MAX_ARCH__>=7
 	tst	ip,#7
 	bne	.Lialu
 	adr	r0,bn_mul_mont
 	adr	r0,.Lbn_mul_mont
 	ldr	r2,.LOPENSSL_armcap
 	ldr	r0,[r0,r2]
 #ifdef	__APPLE__
@@ -117,6 +122,9 @@ bn_mul_mont:
 #endif
 	cmp	ip,#2
 	mov	$num,ip			@ load num
 #ifdef	__thumb2__
 	ittt	lt
 #endif
 	movlt	r0,#0
 	addlt	sp,sp,#2*4
 	blt	.Labrt
@@ -164,10 +172,11 @@ bn_mul_mont:
 	ldr	$n0,[$_n0]		@ restore n0
 	adc	$nhi,$nhi,#0
 	str	$nlo,[$num]		@ tp[num-1]=
 	mov	$tj,sp
 	str	$nhi,[$num,#4]		@ tp[num]=
 .Louter:
 	sub	$tj,$num,sp		@ "original" $num-1 value
 	sub	$tj,$num,$tj		@ "original" $num-1 value
 	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
 	ldr	$bi,[$tp,#4]!		@ *(++bp)
 	sub	$np,$np,$tj		@ "rewind" np to &np[1]
@@ -212,11 +221,16 @@ bn_mul_mont:
 	str	$nhi,[$num,#4]		@ tp[num]=
 	cmp	$tp,$tj
 #ifdef	__thumb2__
 	itt	ne
 #endif
 	movne	$tj,sp
 	bne	.Louter
 	ldr	$rp,[$_rp]		@ pull rp
 	mov	$aj,sp
 	add	$num,$num,#4		@ $num to point at &tp[num]
 	sub	$aj,$num,sp		@ "original" num value
 	sub	$aj,$num,$aj		@ "original" num value
 	mov	$tp,sp			@ "rewind" $tp
 	mov	$ap,$tp			@ "borrow" $ap
 	sub	$np,$np,$aj		@ "rewind" $np to &np[0]
@@ -242,7 +256,8 @@ bn_mul_mont:
 	cmp	$tp,$num
 	bne	.Lcopy
 	add	sp,$num,#4		@ skip over tp[num+1]
 	mov	sp,$num
 	add	sp,sp,#4		@ skip over tp[num+1]
 	ldmia	sp!,{r4-r12,lr}		@ restore registers
 	add	sp,sp,#2*4		@ skip over {r0,r2}
 	mov	r0,#1
@@ -283,6 +298,7 @@ bn_mul8x_mont_neon:
 	stmdb	sp!,{r4-r11}
 	vstmdb	sp!,{d8-d15}		@ ABI specification says so
 	ldmia	ip,{r4-r5}		@ load rest of parameter block
 	mov	ip,sp
 	sub		$toutptr,sp,#16
 	vld1.32		{${Bi}[0]}, [$bptr,:32]!
@@ -638,8 +654,9 @@ bn_mul8x_mont_neon:
 	bne	.LNEON_sub
 	ldr	r10, [$aptr]				@ load top-most bit
 	mov	r11,sp
 	veor	q0,q0,q0
 	sub	r11,$bptr,sp				@ this is num*4
 	sub	r11,$bptr,r11				@ this is num*4
 	veor	q1,q1,q1
 	mov	$aptr,sp
 	sub	$rptr,$rptr,r11				@ rewind $rptr
@@ -649,27 +666,33 @@ bn_mul8x_mont_neon:
 .LNEON_copy_n_zap:
 	ldmia	$aptr!, {r4-r7}
 	ldmia	$rptr,  {r8-r11}
 	it	cc
 	movcc	r8, r4
 	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
 	itt	cc
 	movcc	r9, r5
 	movcc	r10,r6
 	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
 	it	cc
 	movcc	r11,r7
 	ldmia	$aptr, {r4-r7}
 	stmia	$rptr!, {r8-r11}
 	sub	$aptr,$aptr,#16
 	ldmia	$rptr, {r8-r11}
 	it	cc
 	movcc	r8, r4
 	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
 	itt	cc
 	movcc	r9, r5
 	movcc	r10,r6
 	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
 	it	cc
 	movcc	r11,r7
 	teq	$aptr,$bptr				@ preserves carry
 	stmia	$rptr!, {r8-r11}
 	bne	.LNEON_copy_n_zap
 	sub	sp,ip,#96
 	mov	sp,ip
        vldmia  sp!,{d8-d15}
        ldmia   sp!,{r4-r11}
 	ret						@ bx lr
--- a/crypto/fipsmodule/modes/asm/ghash-armv4.pl
+++ b/crypto/fipsmodule/modes/asm/ghash-armv4.pl
@@ -136,7 +136,12 @@ $code=<<___;
 #include <openssl/arm_arch.h>
 .text
 #if defined(__thumb2__) && !defined(__APPLE__)
 .syntax	unified
 .thumb
 #else
 .code	32
 #endif
 #ifdef  __clang__
 #define ldrplb  ldrbpl
@@ -154,19 +159,27 @@ rem_4bit:
 .type	rem_4bit_get,%function
 rem_4bit_get:
 	sub	$rem_4bit,pc,#8
 	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
 #if defined(__thumb2__)
 	adr	$rem_4bit,rem_4bit
 #else
 	sub	$rem_4bit,pc,#8+32	@ &rem_4bit
 #endif
 	b	.Lrem_4bit_got
 	nop
 	nop
 .size	rem_4bit_get,.-rem_4bit_get
 .global	gcm_ghash_4bit
 .type	gcm_ghash_4bit,%function
 .align	4
 gcm_ghash_4bit:
 	sub	r12,pc,#8
 #if defined(__thumb2__)
 	adr	r12,rem_4bit
 #else
 	sub	r12,pc,#8+48		@ &rem_4bit
 #endif
 	add	$len,$inp,$len		@ $len to point at the end
 	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
 	sub	r12,r12,#48		@ &rem_4bit
 	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
 	stmdb	sp!,{r4-r11}		@ ... to stack
@@ -213,6 +226,9 @@ gcm_ghash_4bit:
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
 	eor	$Zhl,$Thl,$Zhl,lsr#4
 #ifdef	__thumb2__
 	it	pl
 #endif
 	ldrplb	$nlo,[$inp,$cnt]
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
@@ -223,6 +239,9 @@ gcm_ghash_4bit:
 	add	$nhi,$nhi,$nhi
 	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
 	eor	$Zll,$Tll,$Zll,lsr#4
 #ifdef	__thumb2__
 	it	pl
 #endif
 	ldrplb	$Tll,[$Xi,$cnt]
 	eor	$Zll,$Zll,$Zlh,lsl#28
 	eor	$Zlh,$Tlh,$Zlh,lsr#4
@@ -230,8 +249,14 @@ gcm_ghash_4bit:
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	eor	$Zhl,$Thl,$Zhl,lsr#4
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 #ifdef	__thumb2__
 	it	pl
 #endif
 	eorpl	$nlo,$nlo,$Tll
 	eor	$Zhh,$Thh,$Zhh,lsr#4
 #ifdef	__thumb2__
 	itt	pl
 #endif
 	andpl	$nhi,$nlo,#0xf0
 	andpl	$nlo,$nlo,#0x0f
 	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
@@ -241,7 +266,11 @@ gcm_ghash_4bit:
 	add	$inp,$inp,#16
 	mov	$nhi,$Zll
 ___
 	&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
 	&Zsmash("cmp\t$inp,$len","\n".
 				 "#ifdef __thumb2__\n".
 				 "	it	ne\n".
 				 "#endif\n".
 				 "	ldrneb	$nlo,[$inp,#15]");
 $code.=<<___;
 	bne	.Louter
@@ -299,6 +328,9 @@ gcm_gmult_4bit:
 	eor	$Zlh,$Zlh,$Zhl,lsl#28
 	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
 	eor	$Zhl,$Thl,$Zhl,lsr#4
 #ifdef	__thumb2__
 	it	pl
 #endif
 	ldrplb	$nlo,[$Xi,$cnt]
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
@@ -316,6 +348,9 @@ gcm_gmult_4bit:
 	eor	$Zhl,$Thl,$Zhl,lsr#4
 	eor	$Zhl,$Zhl,$Zhh,lsl#28
 	eor	$Zhh,$Thh,$Zhh,lsr#4
 #ifdef	__thumb2__
 	itt	pl
 #endif
 	andpl	$nhi,$nlo,#0xf0
 	andpl	$nlo,$nlo,#0x0f
 	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
--- a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
@@ -181,7 +181,12 @@ $code=<<___;
 #include <openssl/arm_arch.h>
 .text
 #if defined(__thumb2__) && !defined(__APPLE__)
 .syntax	unified
 .thumb
 #else
 .code	32
 #endif
 .global	sha1_block_data_order
 .type	sha1_block_data_order,%function
@@ -189,7 +194,8 @@ $code=<<___;
 .align	5
 sha1_block_data_order:
 #if __ARM_MAX_ARCH__>=7
 	sub	r3,pc,#8		@ sha1_block_data_order
 .Lsha1_block:
 	adr	r3,.Lsha1_block
 	ldr	r12,.LOPENSSL_armcap
 	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
 #ifdef	__APPLE__
@@ -216,7 +222,12 @@ for($i=0;$i<5;$i++) {
 	&BODY_00_15(@V);	unshift(@V,pop(@V));
 }
 $code.=<<___;
 #if defined(__thumb2__) && !defined(__APPLE__)
 	mov	$t3,sp
 	teq	$Xi,$t3
 #else
 	teq	$Xi,sp
 #endif
 	bne	.L_00_15		@ [((11+4)*5+2)*3]
 	sub	sp,sp,#25*4
 ___
@@ -235,7 +246,12 @@ for($i=0;$i<5;$i++) {
 	&BODY_20_39(@V);	unshift(@V,pop(@V));
 }
 $code.=<<___;
 #if defined(__thumb2__) && !defined(__APPLE__)
 	mov	$t3,sp
 	teq	$Xi,$t3
 #else
 	teq	$Xi,sp			@ preserve carry
 #endif
 	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
 	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes
@@ -247,7 +263,12 @@ for($i=0;$i<5;$i++) {
 	&BODY_40_59(@V);	unshift(@V,pop(@V));
 }
 $code.=<<___;
 #if defined(__thumb2__) && !defined(__APPLE__)
 	mov	$t3,sp
 	teq	$Xi,$t3
 #else
 	teq	$Xi,sp
 #endif
 	bne	.L_40_59		@ [+((12+5)*5+2)*4]
 	ldr	$K,.LK_60_79
@@ -283,7 +304,7 @@ $code.=<<___;
 .LK_60_79:	.word	0xca62c1d6
 #if __ARM_MAX_ARCH__>=7
 .LOPENSSL_armcap:
 .word	OPENSSL_armcap_P-sha1_block_data_order
 .word	OPENSSL_armcap_P-.Lsha1_block
 #endif
 .asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 .align	5
@@ -458,6 +479,7 @@ sub Xuplast_80 ()
 	&teq		($inp,$len);
 	&sub		($K_XX_XX,$K_XX_XX,16);	# rewind $K_XX_XX
 	&it		("eq");
 	&subeq		($inp,$inp,64);		# reload last block to avoid SEGV
 	&vld1_8		("{@X[-4&7]-@X[-3&7]}","[$inp]!");
 	 eval(shift(@insns));
@@ -508,12 +530,12 @@ sha1_block_data_order_neon:
 	@ dmb				@ errata #451034 on early Cortex A8
 	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so
 	mov	$saved_sp,sp
 	sub	sp,sp,#64		@ alloca
 	sub	$Xfer,sp,#64
 	adr	$K_XX_XX,.LK_00_19
 	bic	sp,sp,#15		@ align for 128-bit stores
 	bic	$Xfer,$Xfer,#15		@ align for 128-bit stores
 	ldmia	$ctx,{$a,$b,$c,$d,$e}	@ load context
 	mov	$Xfer,sp
 	mov	sp,$Xfer		@ alloca
 	vld1.8		{@X[-4&7]-@X[-3&7]},[$inp]!	@ handles unaligned
 	veor		$zero,$zero,$zero
@@ -560,10 +582,13 @@ $code.=<<___;
 	add	$b,$b,$t0
 	add	$c,$c,$t1
 	add	$d,$d,$Xfer
 	it	eq
 	moveq	sp,$saved_sp
 	add	$e,$e,$Ki
 	it	ne
 	ldrne	$Ki,[sp]
 	stmia	$ctx,{$a,$b,$c,$d,$e}
 	itt	ne
 	addne	$Xfer,sp,#3*16
 	bne	.Loop_neon
@@ -584,6 +609,13 @@ my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
 $code.=<<___;
 #if __ARM_MAX_ARCH__>=7
 # if defined(__thumb2__) && !defined(__APPLE__)
 #  define INST(a,b,c,d)	.byte	c,d|0xf,a,b
 # else
 #  define INST(a,b,c,d)	.byte	a,b,c,d|0x10
 # endif
 .type	sha1_block_data_order_armv8,%function
 .align	5
 sha1_block_data_order_armv8:
@@ -678,7 +710,10 @@ ___
 	    # since ARMv7 instructions are always encoded little-endian.
 	    # correct solution is to use .inst directive, but older
 	    # assemblers don't implement it:-(
 	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 	    # this fix-up provides Thumb encoding in conjunction with INST
 	    $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
 	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
 			$word&0xff,($word>>8)&0xff,
 			($word>>16)&0xff,($word>>24)&0xff,
 			$mnemonic,$arg;
--- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
@@ -212,16 +212,12 @@ $code=<<___;
 #endif
 .text
 #if __ARM_ARCH__<7 || defined(__APPLE__)
 .code	32
 #else
 #if defined(__thumb2__) && !defined(__APPLE__)
 .syntax unified
 # ifdef __thumb2__
 #  define adrl adr
 .thumb
 # else
 .code   32
 # endif
 # define adrl adr
 #else
 .code	32
 #endif
 .type	K512,%object
@@ -280,10 +276,10 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .type	sha512_block_data_order,%function
 sha512_block_data_order:
 .Lsha512_block_data_order:
 #if __ARM_ARCH__<7
 #if __ARM_ARCH__<7 && !defined(__thumb2__)
 	sub	r3,pc,#8		@ sha512_block_data_order
 #else
 	adr	r3,sha512_block_data_order
 	adr	r3,.Lsha512_block_data_order
 #endif
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 	ldr	r12,.LOPENSSL_armcap