From ae96383af375d52f30f72554b75272fa226ca795 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Mon, 12 Jun 2017 18:31:15 -0400 Subject: [PATCH] ARMv4 assembly pack: implement support for Thumb2. As some of ARM processors, more specifically Cortex-Mx series, are Thumb2-only, we need to support Thumb2-only builds even in assembly. (Imported from upstream's 11208dcfb9105e8afa37233185decefd45e89e17.) Change-Id: I7cb48ce6a842cf3cfdf553f6e6e6227d52d525c0 Reviewed-on: https://boringssl-review.googlesource.com/17108 Reviewed-by: Adam Langley --- crypto/fipsmodule/aes/asm/aes-armv4.pl | 24 ++++------ crypto/fipsmodule/bn/asm/armv4-mont.pl | 35 +++++++++++--- crypto/fipsmodule/modes/asm/ghash-armv4.pl | 45 ++++++++++++++++-- crypto/fipsmodule/sha/asm/sha1-armv4-large.pl | 47 ++++++++++++++++--- crypto/fipsmodule/sha/asm/sha512-armv4.pl | 16 +++---- 5 files changed, 126 insertions(+), 41 deletions(-) diff --git a/crypto/fipsmodule/aes/asm/aes-armv4.pl b/crypto/fipsmodule/aes/asm/aes-armv4.pl index e8d7f2bf..c1cf4b77 100644 --- a/crypto/fipsmodule/aes/asm/aes-armv4.pl +++ b/crypto/fipsmodule/aes/asm/aes-armv4.pl @@ -70,15 +70,11 @@ $code=<<___; #endif .text -#if __ARM_ARCH__<7 -.code 32 -#else +#if defined(__thumb2__) && !defined(__APPLE__) .syntax unified -# if defined(__thumb2__) && !defined(__APPLE__) .thumb -# else +#else .code 32 -# endif #endif .type AES_Te,%object @@ -193,7 +189,7 @@ AES_Te: .type asm_AES_encrypt,%function .align 5 asm_AES_encrypt: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ asm_AES_encrypt #else adr r3,asm_AES_encrypt @@ -443,19 +439,19 @@ _armv4_AES_encrypt: .align 5 asm_AES_set_encrypt_key: _armv4_AES_set_encrypt_key: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ asm_AES_set_encrypt_key #else adr r3,asm_AES_set_encrypt_key #endif teq r0,#0 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif moveq r0,#-1 beq .Labrt teq r2,#0 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif moveq r0,#-1 @@ -466,7 +462,7 @@ _armv4_AES_set_encrypt_key: teq r1,#192 beq .Lok teq r1,#256 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt ne @ Thumb2 thing, sanity check in ARM #endif movne r0,#-1 @@ -627,7 +623,7 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-16] subs $rounds,$rounds,#1 str $s3,[$key,#-12] -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif subeq r2,$key,#216 @@ -699,7 +695,7 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-24] subs $rounds,$rounds,#1 str $s3,[$key,#-20] -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif subeq r2,$key,#256 @@ -969,7 +965,7 @@ AES_Td: .type asm_AES_decrypt,%function .align 5 asm_AES_decrypt: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ asm_AES_decrypt #else adr r3,asm_AES_decrypt diff --git a/crypto/fipsmodule/bn/asm/armv4-mont.pl b/crypto/fipsmodule/bn/asm/armv4-mont.pl index acda9e52..ad4c12b9 100644 --- a/crypto/fipsmodule/bn/asm/armv4-mont.pl +++ b/crypto/fipsmodule/bn/asm/armv4-mont.pl @@ -82,7 +82,12 @@ $code=<<___; #include .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif #if __ARM_MAX_ARCH__>=7 .align 5 @@ -101,7 +106,7 @@ bn_mul_mont: #if __ARM_MAX_ARCH__>=7 tst ip,#7 bne .Lialu - adr r0,bn_mul_mont + adr r0,.Lbn_mul_mont ldr r2,.LOPENSSL_armcap ldr r0,[r0,r2] #ifdef __APPLE__ @@ -117,6 +122,9 @@ bn_mul_mont: #endif cmp ip,#2 mov $num,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif movlt r0,#0 addlt 
sp,sp,#2*4 blt .Labrt @@ -164,10 +172,11 @@ bn_mul_mont: ldr $n0,[$_n0] @ restore n0 adc $nhi,$nhi,#0 str $nlo,[$num] @ tp[num-1]= + mov $tj,sp str $nhi,[$num,#4] @ tp[num]= .Louter: - sub $tj,$num,sp @ "original" $num-1 value + sub $tj,$num,$tj @ "original" $num-1 value sub $ap,$ap,$tj @ "rewind" ap to &ap[1] ldr $bi,[$tp,#4]! @ *(++bp) sub $np,$np,$tj @ "rewind" np to &np[1] @@ -212,11 +221,16 @@ bn_mul_mont: str $nhi,[$num,#4] @ tp[num]= cmp $tp,$tj +#ifdef __thumb2__ + itt ne +#endif + movne $tj,sp bne .Louter ldr $rp,[$_rp] @ pull rp + mov $aj,sp add $num,$num,#4 @ $num to point at &tp[num] - sub $aj,$num,sp @ "original" num value + sub $aj,$num,$aj @ "original" num value mov $tp,sp @ "rewind" $tp mov $ap,$tp @ "borrow" $ap sub $np,$np,$aj @ "rewind" $np to &np[0] @@ -242,7 +256,8 @@ bn_mul_mont: cmp $tp,$num bne .Lcopy - add sp,$num,#4 @ skip over tp[num+1] + mov sp,$num + add sp,sp,#4 @ skip over tp[num+1] ldmia sp!,{r4-r12,lr} @ restore registers add sp,sp,#2*4 @ skip over {r0,r2} mov r0,#1 @@ -283,6 +298,7 @@ bn_mul8x_mont_neon: stmdb sp!,{r4-r11} vstmdb sp!,{d8-d15} @ ABI specification says so ldmia ip,{r4-r5} @ load rest of parameter block + mov ip,sp sub $toutptr,sp,#16 vld1.32 {${Bi}[0]}, [$bptr,:32]! @@ -638,8 +654,9 @@ bn_mul8x_mont_neon: bne .LNEON_sub ldr r10, [$aptr] @ load top-most bit + mov r11,sp veor q0,q0,q0 - sub r11,$bptr,sp @ this is num*4 + sub r11,$bptr,r11 @ this is num*4 veor q1,q1,q1 mov $aptr,sp sub $rptr,$rptr,r11 @ rewind $rptr @@ -649,27 +666,33 @@ bn_mul8x_mont_neon: .LNEON_copy_n_zap: ldmia $aptr!, {r4-r7} ldmia $rptr, {r8-r11} + it cc movcc r8, r4 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + it cc movcc r11,r7 ldmia $aptr, {r4-r7} stmia $rptr!, {r8-r11} sub $aptr,$aptr,#16 ldmia $rptr, {r8-r11} + it cc movcc r8, r4 vst1.64 {q0-q1}, [$aptr,:256]! @ wipe + itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + it cc movcc r11,r7 teq $aptr,$bptr @ preserves carry stmia $rptr!, {r8-r11} bne .LNEON_copy_n_zap - sub sp,ip,#96 + mov sp,ip vldmia sp!,{d8-d15} ldmia sp!,{r4-r11} ret @ bx lr diff --git a/crypto/fipsmodule/modes/asm/ghash-armv4.pl b/crypto/fipsmodule/modes/asm/ghash-armv4.pl index 868a020a..cf4c4b28 100644 --- a/crypto/fipsmodule/modes/asm/ghash-armv4.pl +++ b/crypto/fipsmodule/modes/asm/ghash-armv4.pl @@ -136,7 +136,12 @@ $code=<<___; #include .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif #ifdef __clang__ #define ldrplb ldrbpl @@ -154,19 +159,27 @@ rem_4bit: .type rem_4bit_get,%function rem_4bit_get: - sub $rem_4bit,pc,#8 - sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit +#if defined(__thumb2__) + adr $rem_4bit,rem_4bit +#else + sub $rem_4bit,pc,#8+32 @ &rem_4bit +#endif b .Lrem_4bit_got nop + nop .size rem_4bit_get,.-rem_4bit_get .global gcm_ghash_4bit .type gcm_ghash_4bit,%function +.align 4 gcm_ghash_4bit: - sub r12,pc,#8 +#if defined(__thumb2__) + adr r12,rem_4bit +#else + sub r12,pc,#8+48 @ &rem_4bit +#endif add $len,$inp,$len @ $len to point at the end stmdb sp!,{r3-r11,lr} @ save $len/end too - sub r12,r12,#48 @ &rem_4bit ldmia r12,{r4-r11} @ copy rem_4bit ... stmdb sp!,{r4-r11} @ ... 
to stack @@ -213,6 +226,9 @@ gcm_ghash_4bit: eor $Zlh,$Zlh,$Zhl,lsl#28 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] eor $Zhl,$Thl,$Zhl,lsr#4 +#ifdef __thumb2__ + it pl +#endif ldrplb $nlo,[$inp,$cnt] eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhh,$Thh,$Zhh,lsr#4 @@ -223,6 +239,9 @@ gcm_ghash_4bit: add $nhi,$nhi,$nhi ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] eor $Zll,$Tll,$Zll,lsr#4 +#ifdef __thumb2__ + it pl +#endif ldrplb $Tll,[$Xi,$cnt] eor $Zll,$Zll,$Zlh,lsl#28 eor $Zlh,$Tlh,$Zlh,lsr#4 @@ -230,8 +249,14 @@ gcm_ghash_4bit: eor $Zlh,$Zlh,$Zhl,lsl#28 eor $Zhl,$Thl,$Zhl,lsr#4 eor $Zhl,$Zhl,$Zhh,lsl#28 +#ifdef __thumb2__ + it pl +#endif eorpl $nlo,$nlo,$Tll eor $Zhh,$Thh,$Zhh,lsr#4 +#ifdef __thumb2__ + itt pl +#endif andpl $nhi,$nlo,#0xf0 andpl $nlo,$nlo,#0x0f eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] @@ -241,7 +266,11 @@ gcm_ghash_4bit: add $inp,$inp,#16 mov $nhi,$Zll ___ - &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]"); + &Zsmash("cmp\t$inp,$len","\n". + "#ifdef __thumb2__\n". + " it ne\n". + "#endif\n". + " ldrneb $nlo,[$inp,#15]"); $code.=<<___; bne .Louter @@ -299,6 +328,9 @@ gcm_gmult_4bit: eor $Zlh,$Zlh,$Zhl,lsl#28 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] eor $Zhl,$Thl,$Zhl,lsr#4 +#ifdef __thumb2__ + it pl +#endif ldrplb $nlo,[$Xi,$cnt] eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhh,$Thh,$Zhh,lsr#4 @@ -316,6 +348,9 @@ gcm_gmult_4bit: eor $Zhl,$Thl,$Zhl,lsr#4 eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhh,$Thh,$Zhh,lsr#4 +#ifdef __thumb2__ + itt pl +#endif andpl $nhi,$nlo,#0xf0 andpl $nlo,$nlo,#0x0f eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] diff --git a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl index 944248fe..93265cdb 100644 --- a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl +++ b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl @@ -181,7 +181,12 @@ $code=<<___; #include .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif .global sha1_block_data_order .type sha1_block_data_order,%function @@ -189,7 +194,8 @@ $code=<<___; .align 5 sha1_block_data_order: #if __ARM_MAX_ARCH__>=7 - sub r3,pc,#8 @ sha1_block_data_order +.Lsha1_block: + adr r3,.Lsha1_block ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P #ifdef __APPLE__ @@ -216,7 +222,12 @@ for($i=0;$i<5;$i++) { &BODY_00_15(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) && !defined(__APPLE__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp +#endif bne .L_00_15 @ [((11+4)*5+2)*3] sub sp,sp,#25*4 ___ @@ -235,7 +246,12 @@ for($i=0;$i<5;$i++) { &BODY_20_39(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) && !defined(__APPLE__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp @ preserve carry +#endif bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes @@ -247,7 +263,12 @@ for($i=0;$i<5;$i++) { &BODY_40_59(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) && !defined(__APPLE__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp +#endif bne .L_40_59 @ [+((12+5)*5+2)*4] ldr $K,.LK_60_79 @@ -283,7 +304,7 @@ $code.=<<___; .LK_60_79: .word 0xca62c1d6 #if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha1_block_data_order +.word OPENSSL_armcap_P-.Lsha1_block #endif .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " .align 5 @@ -458,6 +479,7 @@ sub Xuplast_80 () &teq ($inp,$len); &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX + &it ("eq"); &subeq ($inp,$inp,64); # reload last block to avoid SEGV &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!"); eval(shift(@insns)); @@ 
-508,12 +530,12 @@ sha1_block_data_order_neon: @ dmb @ errata #451034 on early Cortex A8 @ vstmdb sp!,{d8-d15} @ ABI specification says so mov $saved_sp,sp - sub sp,sp,#64 @ alloca + sub $Xfer,sp,#64 adr $K_XX_XX,.LK_00_19 - bic sp,sp,#15 @ align for 128-bit stores + bic $Xfer,$Xfer,#15 @ align for 128-bit stores ldmia $ctx,{$a,$b,$c,$d,$e} @ load context - mov $Xfer,sp + mov sp,$Xfer @ alloca vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned veor $zero,$zero,$zero @@ -560,10 +582,13 @@ $code.=<<___; add $b,$b,$t0 add $c,$c,$t1 add $d,$d,$Xfer + it eq moveq sp,$saved_sp add $e,$e,$Ki + it ne ldrne $Ki,[sp] stmia $ctx,{$a,$b,$c,$d,$e} + itt ne addne $Xfer,sp,#3*16 bne .Loop_neon @@ -584,6 +609,13 @@ my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14)); $code.=<<___; #if __ARM_MAX_ARCH__>=7 + +# if defined(__thumb2__) && !defined(__APPLE__) +# define INST(a,b,c,d) .byte c,d|0xf,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d|0x10 +# endif + .type sha1_block_data_order_armv8,%function .align 5 sha1_block_data_order_armv8: @@ -678,7 +710,10 @@ ___ # since ARMv7 instructions are always encoded little-endian. # correct solution is to use .inst directive, but older # assemblers don't implement it:-( - sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + + # this fix-up provides Thumb encoding in conjunction with INST + $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000); + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl index fd92f7a1..33c4e8cb 100644 --- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl +++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl @@ -212,16 +212,12 @@ $code=<<___; #endif .text -#if __ARM_ARCH__<7 || defined(__APPLE__) -.code 32 -#else +#if defined(__thumb2__) && !defined(__APPLE__) .syntax unified -# ifdef __thumb2__ -# define adrl adr .thumb -# else -.code 32 -# endif +# define adrl adr +#else +.code 32 #endif .type K512,%object @@ -280,10 +276,10 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .type sha512_block_data_order,%function sha512_block_data_order: .Lsha512_block_data_order: -#if __ARM_ARCH__<7 +#if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha512_block_data_order #else - adr r3,sha512_block_data_order + adr r3,.Lsha512_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap
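
A few notes on the recurring patterns in this conversion. The bulk of the churn is the it/itt/ittt insertions: in ARM state almost any instruction can carry its own condition code, but in Thumb-2 a conditionally executed instruction has to be introduced by an IT instruction that names the condition and covers one to four following instructions, so every moveq/addlt/ldrplb-style sequence above gains a guarded IT prefix. A minimal sketch of the pattern (the check_arg helper is hypothetical, not part of the patch):

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	check_arg,%function
check_arg:
	teq	r0,#0			@ Z=1 when the argument is NULL
#ifdef	__thumb2__
	itt	eq			@ next two instructions execute only if EQ
#endif
	moveq	r0,#-1			@ conditional on its own in ARM; needs the IT block in Thumb-2
	beq	.Lout			@ a branch may occupy the last IT slot
	mov	r0,#0
.Lout:
	bx	lr
.size	check_arg,.-check_arg

As in the patch, the IT stays under #ifdef __thumb2__, so ARM-mode builds with older assemblers never see it.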
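Second, the addressing changes. The old "sub rX,pc,#8" idiom only yields the current instruction's address in ARM state, where reading PC gives the current instruction plus 8; in Thumb state the offset is 4 and PC is far more restricted as an operand, so the patch switches to adr (in ghash-armv4.pl the extra displacement is folded into a single "sub rX,pc,#8+32"-style constant for the ARM path, with nop/.align padding keeping the distance fixed). The adr targets also move to local labels such as .Lsha1_block and .Lbn_mul_mont, with the matching ".word OPENSSL_armcap_P-.Lsha1_block", presumably because taking the address of a function symbol from Thumb code can come back with the Thumb bit set and upset the table arithmetic that follows. A sketch under those assumptions (get_own_address is an invented name):

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	get_own_address,%function
get_own_address:
.Lget_own_address:			@ local label: no function/Thumb-bit semantics attached
#ifndef	__thumb2__
	sub	r0,pc,#8		@ ARM only: PC reads as "this instruction + 8"
#else
	adr	r0,.Lget_own_address	@ assembler-generated PC-relative add/sub,
					@ valid in both ARM and Thumb-2
#endif
	bx	lr
.size	get_own_address,.-get_own_address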
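Third, the SP rewrites in armv4-mont.pl and sha1-armv4-large.pl. Thumb-2 only accepts SP in a handful of data-processing forms (roughly MOV to or from SP, and ADD/SUB with SP as both destination and first operand), so expressions such as "sub $tj,$num,sp", "add sp,$num,#4" and "bic sp,sp,#15" are not encodable there. The patch therefore snapshots SP into an ordinary register first and does the arithmetic on the copy, using the same portable sequence in both states rather than conditionalising it. A condensed sketch of the patterns (the stack_demo wrapper is invented; the individual instruction pairs mirror the hunks above):

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	stack_demo,%function
stack_demo:
	mov	ip,sp			@ keep the caller's SP in a plain register
	sub	r1,sp,#64		@ carve out 64 bytes, computed into r1 because
	bic	r1,r1,#15		@ "bic sp,sp,#15" is not allowed in Thumb-2
	mov	sp,r1			@ MOV to SP works in both states

	@ ... the aligned scratch area at [sp] would be used here ...

	mov	r2,sp			@ "sub r2,ip,sp" (SP as second operand) is rejected
	sub	r2,ip,r2		@ for Thumb-2, so copy SP out before subtracting
	mov	sp,ip			@ restore SP, as in the NEON epilogue's "mov sp,ip"
	bx	lr
.size	stack_demo,.-stack_demo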
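Finally, the INST() macro in sha1-armv4-large.pl. The ARMv8 SHA-1 instructions are emitted as raw bytes so that assemblers without the crypto extension still build the file, and the raw encodings differ between states: ARM stores the instruction as one little-endian 32-bit word, while Thumb-2 stores two little-endian halfwords, most-significant half first, and encodes the Advanced SIMD leading byte as 0xEF/0xFF where ARM uses 0xF2/0xF3. As far as I can tell that is exactly what the macro and the Perl fix-up implement between them: the Perl side clears bit 28 of words whose bits 27-24 are 0b0010, and the macro ORs the right bits back in for whichever state is being assembled (d|0x10 recreates the 0xF2/0xF3 ARM byte, d|0xf produces the 0xEF/0xFF Thumb-2 byte). An annotated copy of the macro, with a,b,c,d being the bytes of the fixed-up ARM word, least significant first:

# if defined(__thumb2__) && !defined(__APPLE__)
#  define INST(a,b,c,d)	.byte	c,d|0xf,a,b	@ two little-endian halfwords, high half first
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d|0x10	@ one little-endian ARM word, bit 28 restored
# endif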