boringssl/crypto/fipsmodule/bn/asm/x86-mont.pl

#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)

# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the assembly file to generate; it is
# removed from @ARGV so the remaining arguments can be scanned for options.
# STDOUT is re-opened onto that file.  The three-argument form of open keeps
# the path from ever being parsed for mode characters, and both a missing
# argument and a failed open are fatal: continuing would silently send the
# generated code somewhere other than the requested file.  (With the
# three-argument form an undefined path must be rejected explicitly.)
$output = pop;
defined($output) or die "$0: no output file specified\n";
open STDOUT,">",$output or die "can't open $output: $!";

# Initialise the perlasm backend with the first command-line argument
# (the output file name has already been popped off @ARGV above).
&asm_init($ARGV[0]);

# The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
# somewhere among the remaining arguments.
$sse2 = (grep { /-DOPENSSL_IA32_SSE2/ } @ARGV) ? 1 : 0;

# The SSE2 path reads OPENSSL_ia32cap_P at run time, so declare the symbol
# as external in that case.
if ($sse2) {
	&external_label("OPENSSL_ia32cap_P");
}

# Open the bn_mul_mont function in the generated assembly.
&function_begin("bn_mul_mont");

# Symbolic names for the general-purpose registers used below.  $ap/$tp
# share esi and $rp/$bp share edi, so only one name of each pair can be
# live at any given point.
($i,$j) = ("edx","ecx");
$ap = $tp = "esi";		# overlapping variables!!!
$rp = $bp = "edi";		# overlapping variables!!!
($np,$num) = ("ebp","ebx");

# Scratch slots at the top of the stack frame, one 32-bit word apiece,
# addressed relative to esp.  $_n0q names the same address as $_n0 but via
# QWP (presumably a qword-sized operand -- confirm in x86asm.pl).
$_num   = &DWP( 0,"esp");
$_rp    = &DWP( 4,"esp");
$_ap    = &DWP( 8,"esp");
$_bp    = &DWP(12,"esp");
$_np    = &DWP(16,"esp");
$_n0    = &DWP(20,"esp");
$_n0q   = &QWP(20,"esp");
$_sp    = &DWP(24,"esp");
$_bpend = &DWP(28,"esp");

# Eight 4-byte slots: the frame size, already a multiple of 16.
$frame = 4*8;

	# Prologue.  eax is cleared up front so that the early exit to
	# just_leave (taken when num < 4) leaves 0 in eax; the normal exit
	# path sets eax to 1 just before just_leave.
	&xor	("eax","eax");
	&mov	("edi",&wparam(5));	# int num
	&cmp	("edi",4);
	&jl	(&label("just_leave"));

	# NOTE(review): both of the following are lea, so esi/edx receive the
	# addresses of the argument slots, not the argument values; edx is
	# therefore the address of the slot holding ap -- confirm that this
	# is what the 2K-window computation below intends.
	&lea	("esi",&wparam(0));	# put aside pointer to argument block
	&lea	("edx",&wparam(1));	# load ap
	&add	("edi",2);		# extra two words on top of tp
	&neg	("edi");
	&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
	# second neg restores edi to num+2
	&neg	("edi");

	# minimize cache contention by arranging 2K window between stack
	# pointer and ap argument [np is also position sensitive vector,
	# but it's assumed to be near ap, as it's allocated at ~same
	# time].
	&mov	("eax","ebp");
	&sub	("eax","edx");
	&and	("eax",2047);
	&sub	("ebp","eax");		# this aligns sp and ap modulo 2048

	&xor	("edx","ebp");
	&and	("edx",2048);
	&xor	("edx",2048);
	&sub	("ebp","edx");		# this splits them apart modulo 4096

	&and	("ebp",-64);		# align to cache line

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	#
	# Mechanics: eax = (esp - ebp) rounded down to a multiple of 4096.
	# esp is first moved to ebp+eax and a word is read from the new top
	# of stack; then, while esp is still above ebp, it is lowered by
	# 4096 and the new top of stack is read again.  The caller's esp is
	# kept in edx until it is stored into $_sp below.
	&mov	("eax","esp");
	&sub	("eax","ebp");
	&and	("eax",-4096);
	&mov	("edx","esp");		# saved stack pointer!
	&lea	("esp",&DWP(0,"ebp","eax"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
	&jmp	(&label("page_walk_done"));

&set_label("page_walk",16);
	&lea	("esp",&DWP(-4096,"esp"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
&set_label("page_walk_done");

	################################# load argument block...
	# esi still holds the argument-block address taken before esp moved.
	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
	&mov	("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
	#&mov	("edi",&DWP(5*4,"esi"));# int num

	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
	&mov	($_rp,"eax");		# ... save a copy of argument block
	&mov	($_ap,"ebx");
	&mov	($_bp,"ecx");
	&mov	($_np,"ebp");
	&mov	($_n0,"esi");
	# edi holds num+2 at this point, so edi-3 is num-1; from here on the
	# $num register contains num-1, not num.
	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
	#&mov	($_num,$num);		# redundant as $num is not reused
	&mov	($_sp,"edx");		# saved stack pointer!

# SSE2 path.  This whole section is emitted only when the script was invoked
# with -DOPENSSL_IA32_SSE2.  At run time it is entered only if bit 26 of the
# first OPENSSL_ia32cap_P word is set; otherwise control transfers to the
# non_sse2 label defined at the end of this section.  On completion it
# jumps to common_tail.
if($sse2) {
$acc0="mm0";	# mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";

	&picmeup("eax","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"eax"),26);
	&jnc	(&label("non_sse2"));

	&mov	("eax",-1);
	&movd	($mask,"eax");		# mask 32 lower bits

	&mov	($ap,$_ap);		# load input pointers
	&mov	($bp,$_bp);
	&mov	($np,$_np);

	&xor	($i,$i);		# i=0
	&xor	($j,$j);		# j=0

	# Set-up for the first row (i=0).  $mul0 holds bp[0]; $mul1 becomes
	# ap[0]*bp[0]*n0 and is used as the multiplier for np[] (called m1 in
	# the comments below).  $car0 is the running sum of the ap[]*bp[0]
	# chain and $car1 that of the np[]*m1 chain.
	&movd	($mul0,&DWP(0,$bp));		# bp[0]
	&movd	($mul1,&DWP(0,$ap));		# ap[0]
	&movd	($car1,&DWP(0,$np));		# np[0]

	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
	&movq	($car0,$mul1);
	&movq	($acc0,$mul1);			# I wish movd worked for
	&pand	($acc0,$mask);			# inter-register transfers

	&pmuludq($mul1,$_n0q);			# *=n0

	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
	&paddq	($car1,$acc0);

	&movd	($acc1,&DWP(4,$np));		# np[1]
	&movd	($acc0,&DWP(4,$ap));		# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&inc	($j);				# j++
	# First-row loop: runs while j < $num (which holds num-1).  Each
	# iteration already loads ap[j+1]/np[j+1] for the next one and
	# stores the low word of $car1 one slot below j (tp[j-1]).
&set_label("1st",16);
	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
	&pmuludq($acc1,$mul1);			# np[j]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
	&psrlq	($car1,32);

	&lea	($j,&DWP(1,$j));
	&cmp	($j,$num);
	&jl	(&label("1st"));

	# Last word of the first row, then the two carries are summed and
	# stored as a 64-bit value covering tp[num-1] and tp[num].
	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
	&pmuludq($acc1,$mul1);			# np[num-1]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&paddq	($car1,$car0);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&inc	($i);				# i++
	# Outer loop over the remaining words of bp: repeats while
	# i <= $num, i.e. for i = 1 .. num-1.  Same structure as the first
	# row, with the previous contents of tp[] added in.
&set_label("outer");
	&xor	($j,$j);			# j=0

	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
	&movd	($mul1,&DWP(0,$ap));		# ap[0]
	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
	&movd	($car1,&DWP(0,$np));		# np[0]
	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]

	&paddq	($mul1,$temp);			# +=tp[0]
	&movq	($acc0,$mul1);
	&movq	($car0,$mul1);
	&pand	($acc0,$mask);

	&pmuludq($mul1,$_n0q);			# *=n0

	&pmuludq($car1,$mul1);
	&paddq	($car1,$acc0);

	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
	&movd	($acc1,&DWP(4,$np));		# np[1]
	&movd	($acc0,&DWP(4,$ap));		# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);
	&paddq	($car0,$temp);			# +=tp[1]

	&inc	($j);				# j++
	# $num doubles as the inner-loop down-counter: it is decremented
	# once here and once per iteration, and the loop ends when it hits
	# zero.  j is advanced with lea, which leaves the flags set by the
	# preceding dec intact for the jnz.
	&dec	($num);
&set_label("inner");
	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
	&pmuludq($acc1,$mul1);			# np[j]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
	&psrlq	($car1,32);
	&paddq	($car0,$temp);			# +=tp[j+1]

	&dec	($num);
	&lea	($j,&DWP(1,$j));		# j++
	&jnz	(&label("inner"));

	# $num was counted down to zero above; j has reached num-1 by now,
	# so copying it back restores $num to its usual value.
	&mov	($num,$j);
	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
	&pmuludq($acc1,$mul1);			# np[num-1]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
	&paddq	($car1,$car0);
	&paddq	($car1,$temp);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&lea	($i,&DWP(1,$i));		# i++
	&cmp	($i,$num);
	&jle	(&label("outer"));

	&emms	();				# done with mmx bank
	&jmp	(&label("common_tail"));

&set_label("non_sse2",16);
}

# Integer-only path.  The "if (0)" arm is disabled at generation time: it
# would make the function restore esp and leave with eax=0 instead of doing
# the work.  The "else" arm is what actually gets emitted; it contains a
# general multiplication (labels mull, 1stmadd, 2ndmadd) and a dedicated
# squaring routine (labels bn_sqr_mont, sqr, 3rdmadd, sqradd, sqrlast),
# both of which finish by jumping to common_tail.
#
# Instructions written with one extra leading space appear to be
# interleaved from the following step for scheduling purposes -- they do
# not take part in the mul/add/adc chain around them.
if (0) {
	&mov	("esp",$_sp);
	&xor	("eax","eax");	# signal "not fast enough [yet]"
	&jmp	(&label("just_leave"));
	# While the below code provides competitive performance for
	# all key lengths on modern Intel cores, it's still more
	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
	# means compared to the original integer-only assembler.
	# 512-bit RSA sign is better by ~40%, but that's about all
	# one can say about all CPUs...
} else {
$inp="esi";	# integer path uses these registers differently
$word="edi";
$carry="ebp";

	# Choose between multiplication and squaring.  $num holds num-1, so
	# $carry becomes (num & 1) | (ap - bp), which is zero only when num
	# is even and ap == bp; in that case jump to bn_sqr_mont.  The flags
	# tested by jz are those of the "or": the mov between them does not
	# modify flags.
	&mov	($inp,$_ap);
	&lea	($carry,&DWP(1,$num));
	&mov	($word,$_bp);
	&xor	($j,$j);				# j=0
	&mov	("edx",$inp);
	&and	($carry,1);				# see if num is even
	&sub	("edx",$word);				# see if ap==bp
	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
	&or	($carry,"edx");
	&mov	($word,&DWP(0,$word));			# bp[0]
	&jz	(&label("bn_sqr_mont"));
	&mov	($_bpend,"eax");
	&mov	("eax",&DWP(0,$inp));
	&xor	("edx","edx");

	# First row: multiply ap[] by bp[0].  The high word of each product
	# (edx) is carried into the next iteration through $carry.
&set_label("mull",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[0]
	&add	($carry,"eax");
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("mull"));

	# Last word of the first row.  Meanwhile $word is reloaded with
	# n0*tp[0] (the multiplier m for the reduction) and $inp with np.
	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[0]
	 &mov	($word,$_n0);
	&add	("eax",$carry);
	 &mov	($inp,$_np);
	&adc	("edx",0);
	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
	&xor	($j,$j);
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	# Word 0 of the reduction: only the carry out of tp[0]+np[0]*m is
	# kept (in edx); the low word in eax is overwritten by np[1].
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&inc	($j);

	&jmp	(&label("2ndmadd"));

	# Rows i > 0: add ap[]*bp[i] to tp[] in place.
&set_label("1stmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[i]
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("1stmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[i]
	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	 &mov	($word,$_n0);
	&adc	("edx",0);
	 &mov	($inp,$_np);
	&add	($carry,"eax");
	&adc	("edx",0);
	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&xor	($j,$j);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
	&adc	($j,0);
	 &mov	("eax",&DWP(0,$inp));			# np[0]
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&mov	($j,1);

	# Reduction row: add np[]*m to tp[] and store each result one slot
	# lower (tp[j-1]), i.e. tp is shifted down by one word as it goes.
&set_label("2ndmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
	&jl	(&label("2ndmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	# Fold the final carry into the two top words, then step the saved
	# bp pointer forward by one word.  When it equals $_bpend (&bp[num])
	# all rows are done; otherwise store it back and start the next row.
	&xor	("eax","eax");
	 &mov	($j,$_bp);				# &bp[i]
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	 &lea	($j,&DWP(4,$j));
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	 &cmp	($j,$_bpend);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	&mov	($word,&DWP(0,$j));			# bp[i+1]
	&mov	($inp,$_ap);
	&mov	($_bp,$j);				# &bp[++i]
	&xor	($j,$j);
	&xor	("edx","edx");
	&mov	("eax",&DWP(0,$inp));
	&jmp	(&label("1stmadd"));

	# Squaring path (reached only when ap == bp and num is even).
	# $sbit is assigned the same register name as $num, so the value of
	# $num is parked in the $_num stack slot first.  The $_bp slot is
	# reused to hold the row index i rather than a pointer.
&set_label("bn_sqr_mont",16);
$sbit=$num;
	&mov	($_num,$num);
	&mov	($_bp,$j);				# i=0

	&mov	("eax",$word);				# ap[0]
	&mul	($word);				# ap[0]*ap[0]
	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
	&mov	($sbit,"edx");
	&shr	("edx",1);
	&and	($sbit,1);
	&inc	($j);
	# First squaring row.  Each product ap[j]*ap[0] is doubled on the
	# fly: lea forms $sbit + 2*eax for the stored word, and the bit
	# shifted out of eax (shr 31) becomes $sbit for the next word.
&set_label("sqr",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[0]
	&add	("eax",$carry);
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	&shr	("eax",31);
	&cmp	($j,$_num);
	&mov	($sbit,"eax");
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("sqr"));

	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*ap[0]
	&add	("eax",$carry);
	 &mov	($word,$_n0);
	&adc	("edx",0);
	 &mov	($inp,$_np);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
	&shr	("eax",31);
	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=

	&lea	($carry,&DWP(0,"eax","edx",2));
	 &mov	("eax",&DWP(0,$inp));			# np[0]
	&shr	("edx",31);
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=

	# $sbit is no longer needed, so its register goes back to being
	# $num (reloaded from j).
	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	($num,$j);
	&adc	("edx",0);
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&mov	($j,1);

	# Reduction row for the squaring path: the same per-word step as
	# 2ndmadd, but two words of np are processed per iteration (j += 2).
&set_label("3rdmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=

	&mov	($carry,"edx");
	&mul	($word);				# np[j+1]*m
	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
	&lea	($j,&DWP(2,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("3rdmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	# Fold the final carry into the top words and reload the row index
	# i from the $_bp slot; the squaring is complete when i == $num.
	&mov	($j,$_bp);				# i
	&xor	("eax","eax");
	&mov	($inp,$_ap);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	&cmp	($j,$num);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	# Start of the next squaring row: i is incremented, ap[i]*ap[i] is
	# added into tp[i], and if this was the last word the doubling loop
	# is skipped (jump to sqrlast with $carry cleared).
	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
	&lea	($j,&DWP(1,$j));
	&mov	("eax",$word);
	&mov	($_bp,$j);				# ++i
	&mul	($word);				# ap[i]*ap[i]
	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
	&adc	("edx",0);
	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
	&xor	($carry,$carry);
	&cmp	($j,$num);
	&lea	($j,&DWP(1,$j));
	&je	(&label("sqrlast"));

	&mov	($sbit,"edx");				# zaps $num
	&shr	("edx",1);
	&and	($sbit,1);
	# Cross products of row i: each ap[j]*ap[i] is doubled (lea
	# eax+eax, with the bit shifted out recovered by shr 31), then tp[j]
	# and the previous $sbit are added.  The loop bound is read from the
	# $_num slot because the $num register is in use as $sbit.
&set_label("sqradd",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[i]
	&add	("eax",$carry);
	&lea	($carry,&DWP(0,"eax","eax"));
	&adc	("edx",0);
	&shr	("eax",31);
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("eax",0);
	&add	($carry,$sbit);
	&adc	("eax",0);
	&cmp	($j,$_num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&mov	($sbit,"eax");
	&jle	(&label("sqradd"));

	&mov	($carry,"edx");
	&add	("edx","edx");
	&shr	($carry,31);
	&add	("edx",$sbit);
	&adc	($carry,0);
	# Common end of a squaring row: compute m = n0*tp[0], fold
	# edx/$carry into the two top words, restore $num to j-1 and enter
	# the 3rdmadd reduction with j=1.
&set_label("sqrlast");
	&mov	($word,$_n0);
	&mov	($inp,$_np);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&adc	($carry,0);
	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&lea	($num,&DWP(-1,$j));
	&adc	("edx",0);
	&mov	($j,1);
	&mov	("eax",&DWP(4,$inp));			# np[1]

	&jmp	(&label("3rdmadd"));
}

# Common tail shared by the SSE2 and integer paths: subtract the modulus
# from tp[] into rp[], pick either that difference or the unsubtracted
# tp[] depending on the final borrow, copy the chosen value to rp[] while
# overwriting tp[], restore the caller's stack pointer and leave with
# eax=1.  On entry $num holds num-1 and tp[] starts at esp+$frame.
&set_label("common_tail",16);
	&mov	($np,$_np);			# load modulus pointer
	&mov	($rp,$_rp);			# load result pointer
	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]

	&mov	("eax",&DWP(0,$tp));		# tp[0]
	&mov	($j,$num);			# j=num-1
	&xor	($i,$i);			# i=0 and clear CF!

	# Multi-word subtraction with the borrow carried in CF from one sbb
	# to the next.  dec, mov and lea are used for the bookkeeping
	# because none of them modifies CF; the jge tests the flags left by
	# dec, so the loop ends once j has gone negative (j is -1 on exit).
&set_label("sub",16);
	&sbb	("eax",&DWP(0,$np,$i,4));
	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
	&dec	($j);				# doesn't affect CF!
	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
	&lea	($i,&DWP(1,$i));		# i++
	&jge	(&label("sub"));

	# eax holds tp[num] here; subtracting the last borrow is expected to
	# leave an all-zeros or all-ones mask (assumes tp[num] is 0 or 1 --
	# TODO confirm).  The mask selects the source pointer for the copy
	# without a branch: tp if the mask is all ones, rp otherwise.
	# NOTE(review): the address the copy loop reads from therefore
	# depends on the final borrow; if a data-independent memory access
	# pattern is required, check whether a masked per-word select is
	# needed instead.
	&sbb	("eax",0);			# handle upmost overflow bit
	&and	($tp,"eax");
	&not	("eax");
	&mov	($np,$rp);
	&and	($np,"eax");
	&or	($tp,$np);			# tp=carry?tp:rp

	# Walks from index num-1 down to 0.  Each temporary word is
	# overwritten with the contents of $j (left at -1 by the sub loop).
&set_label("copy",16);				# copy or in-place refresh
	&mov	("eax",&DWP(0,$tp,$num,4));
	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
	&dec	($num);
	&jge	(&label("copy"));

	&mov	("esp",$_sp);		# pull saved stack pointer
	&mov	("eax",1);
	# The early exit for num < 4 lands here with eax still 0.
&set_label("just_leave");
&function_end("bn_mul_mont");

# Embed an identification string in the generated object and let the
# perlasm backend finish up.
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# Buffered write errors (a full disk, for instance) only surface when the
# handle is closed, so a failed close must abort the build rather than leave
# a truncated assembly file behind unnoticed.
close STDOUT or die "error closing STDOUT: $!";
-												Sync up some perlasm license headers and easy fixes.

These files are otherwise up-to-date with OpenSSL master as of
50ea9d2b3521467a11559be41dcf05ee05feabd6, modulo a couple of spelling
fixes which I've imported.

I've also reverted the same-line label and instruction patch to
x86_64-mont*.pl. The new delocate parser handles that fine.

Change-Id: Ife35c671a8104c3cc2fb6c5a03127376fccc4402
Reviewed-on: https://boringssl-review.googlesource.com/25644
Reviewed-by: Adam Langley <agl@google.com>

											
										
										
											2018-02-02 20:12:22 +00:00
+								#! /usr/bin/env perl
 								# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
 								#
 								# Licensed under the OpenSSL license (the "License").  You may not use
 								# this file except in compliance with the License.  You can obtain a copy
 								# in the file LICENSE in the source distribution or at
 								# https://www.openssl.org/source/license.html
-												Inital import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
 								# ====================================================================
-												Sync up some perlasm license headers and easy fixes.

These files are otherwise up-to-date with OpenSSL master as of
50ea9d2b3521467a11559be41dcf05ee05feabd6, modulo a couple of spelling
fixes which I've imported.

I've also reverted the same-line label and instruction patch to
x86_64-mont*.pl. The new delocate parser handles that fine.

Change-Id: Ife35c671a8104c3cc2fb6c5a03127376fccc4402
Reviewed-on: https://boringssl-review.googlesource.com/25644
Reviewed-by: Adam Langley <agl@google.com>

											
										
										
											2018-02-02 20:12:22 +00:00
+								# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-												Inital import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
+								# project. The module is, however, dual licensed under OpenSSL and
 								# CRYPTOGAMS licenses depending on where you obtain it. For further
 								# details see http://www.openssl.org/~appro/cryptogams/.
 								# ====================================================================
 								# October 2005
 								#
 								# This is a "teaser" code, as it can be improved in several ways...
 								# First of all non-SSE2 path should be implemented (yes, for now it
 								# performs Montgomery multiplication/convolution only on SSE2-capable
 								# CPUs such as P4, others fall down to original code). Then inner loop
 								# can be unrolled and modulo-scheduled to improve ILP and possibly
 								# moved to 128-bit XMM register bank (though it would require input
 								# rearrangement and/or increase bus bandwidth utilization). Dedicated
 								# squaring procedure should give further performance improvement...
 								# Yet, for being draft, the code improves rsa512 *sign* benchmark by
 								# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
 								# December 2006
 								#
 								# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
 								# Integer-only code [being equipped with dedicated squaring procedure]
 								# gives ~40% on rsa512 sign benchmark...
 								$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-												Move bn/ into crypto/fipsmodule/

Change-Id: I68aa4a740ee1c7f2a308a6536f408929f15b694c
Reviewed-on: https://boringssl-review.googlesource.com/15647
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>

											
										
										
											2017-04-28 22:47:06 +01:00
+								push(@INC,"${dir}","${dir}../../../perlasm");
-												Inital import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
+								require "x86asm.pl";
-												Switch perlasm calling convention.

Depending on architecture, perlasm differed on which one or both of:

  perl foo.pl flavor output.S
  perl foo.pl flavor > output.S

Upstream has now unified on the first form after making a number of
changes to their files (the second does not even work for their x86
files anymore). Sync those portions of our perlasm scripts with upstream
and update CMakeLists.txt and generate_build_files.py per the new
convention.

This imports various commits like this one:
184bc45f683c76531d7e065b6553ca9086564576 (this was done by taking a
diff, so I don't have the full list)

Confirmed that generate_build_files.py sees no change.

BUG=14

Change-Id: Id2fb5b8bc2a7369d077221b5df9a6947d41f50d2
Reviewed-on: https://boringssl-review.googlesource.com/8518
Reviewed-by: Adam Langley <agl@google.com>

											
										
										
											2016-06-26 18:18:50 +01:00
+								$output = pop;
 								open STDOUT,">$output";
-												Remove trailing whitespace from Perl files.

Upstream did this in 609b0852e4d50251857dbbac3141ba042e35a9ae and it's
easier to apply patches if we do also.

Change-Id: I5142693ed1e26640987ff16f5ea510e81bba200e
Reviewed-on: https://boringssl-review.googlesource.com/13771
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 20:21:08 +00:00
-												Remove filename argument to x86 asm_init.

43e5a26b535f8eeee33c3106de786aea4f5023c8 removed the .file directive
from x86asm.pl. This removes the parameter from asm_init altogether. See
also upstream's e195c8a2562baef0fdcae330556ed60b1e922b0e.

Change-Id: I65761bc962d09f9210661a38ecf6df23eae8743d
Reviewed-on: https://boringssl-review.googlesource.com/16247
Reviewed-by: Steven Valdez <svaldez@google.com>
Commit-Queue: Steven Valdez <svaldez@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>

											
										
										
											2017-05-11 23:19:53 +01:00
+								&asm_init($ARGV[0]);
-												Inital import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
 								$sse2=0;
 								for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 								&external_label("OPENSSL_ia32cap_P") if ($sse2);
 								&function_begin("bn_mul_mont");
 								$i="edx";
 								$j="ecx";
 								$ap="esi";	$tp="esi";		# overlapping variables!!!
 								$rp="edi";	$bp="edi";		# overlapping variables!!!
 								$np="ebp";
 								$num="ebx";
 								$_num=&DWP(4*0,"esp");			# stack top layout
 								$_rp=&DWP(4*1,"esp");
 								$_ap=&DWP(4*2,"esp");
 								$_bp=&DWP(4*3,"esp");
 								$_np=&DWP(4*4,"esp");
 								$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
 								$_sp=&DWP(4*6,"esp");
 								$_bpend=&DWP(4*7,"esp");
 								$frame=32;				# size of above frame rounded up to 16n
 									&xor	("eax","eax");
 									&mov	("edi",&wparam(5));	# int num
 									&cmp	("edi",4);
 									&jl	(&label("just_leave"));
 									&lea	("esi",&wparam(0));	# put aside pointer to argument block
 									&lea	("edx",&wparam(1));	# load ap
 									&add	("edi",2);		# extra two words on top of tp
 									&neg	("edi");
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

(Imports upstream's 3ba1ef829cf3dd36eaa5e819258d90291c6a1027.)

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Change-Id: Ife42935319de79c6c76f8df60a76204c546fd1e0
Reviewed-on: https://boringssl-review.googlesource.com/13775
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:29:19 +00:00
+									&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
-												Inital import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
+									&neg	("edi");
-												Sync up some perlasm license headers and easy fixes.

These files are otherwise up-to-date with OpenSSL master as of
50ea9d2b3521467a11559be41dcf05ee05feabd6, modulo a couple of spelling
fixes which I've imported.

I've also reverted the same-line label and instruction patch to
x86_64-mont*.pl. The new delocate parser handles that fine.

Change-Id: Ife35c671a8104c3cc2fb6c5a03127376fccc4402
Reviewed-on: https://boringssl-review.googlesource.com/25644
Reviewed-by: Adam Langley <agl@google.com>

											
										
										
											2018-02-02 20:12:22 +00:00
+									# minimize cache contention by arranging 2K window between stack
-												Inital import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
+									# pointer and ap argument [np is also position sensitive vector,
 									# but it's assumed to be near ap, as it's allocated at ~same
 									# time].
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

(Imports upstream's 3ba1ef829cf3dd36eaa5e819258d90291c6a1027.)

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Change-Id: Ife42935319de79c6c76f8df60a76204c546fd1e0
Reviewed-on: https://boringssl-review.googlesource.com/13775
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:29:19 +00:00
+									&mov	("eax","ebp");
-												Inital import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
+									&sub	("eax","edx");
 									&and	("eax",2047);
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

(Imports upstream's 3ba1ef829cf3dd36eaa5e819258d90291c6a1027.)

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Change-Id: Ife42935319de79c6c76f8df60a76204c546fd1e0
Reviewed-on: https://boringssl-review.googlesource.com/13775
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:29:19 +00:00
+									&sub	("ebp","eax");		# this aligns sp and ap modulo 2048
-												Inital import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

(Imports upstream's 3ba1ef829cf3dd36eaa5e819258d90291c6a1027.)

Original strategy for page-walking was adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior adjusting stack pointer would
upset valgrind. As compromise let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Change-Id: Ife42935319de79c6c76f8df60a76204c546fd1e0
Reviewed-on: https://boringssl-review.googlesource.com/13775
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:29:19 +00:00
+									&xor	("edx","ebp");
-												Inital import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
+									&and	("edx",2048);
 									&xor	("edx",2048);
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

(Imports upstream's 3ba1ef829cf3dd36eaa5e819258d90291c6a1027.)

Original strategy for page-walking was to adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior to adjusting stack pointer would
upset valgrind. As a compromise, let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Change-Id: Ife42935319de79c6c76f8df60a76204c546fd1e0
Reviewed-on: https://boringssl-review.googlesource.com/13775
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:29:19 +00:00
+									&sub	("ebp","edx");		# this splits them apart modulo 4096
-												Initial import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

(Imports upstream's 3ba1ef829cf3dd36eaa5e819258d90291c6a1027.)

Original strategy for page-walking was to adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior to adjusting stack pointer would
upset valgrind. As a compromise, let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Change-Id: Ife42935319de79c6c76f8df60a76204c546fd1e0
Reviewed-on: https://boringssl-review.googlesource.com/13775
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:29:19 +00:00
+									&and	("ebp",-64);		# align to cache line
-												Initial import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
-												On Windows, page walking is known as __chkstk.

(Imports upstream's 0a86f668212acfa6b48abacbc17b99c234eedf33.)

Change-Id: Ie31d99f8cc3e93b6a9c7c5daa066de96941b3f7c
Reviewed-on: https://boringssl-review.googlesource.com/13770
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 20:14:45 +00:00
+									# An OS-agnostic version of __chkstk.
 									#
-												Explain *cough*-dows

(Imports upstream's 1bf80d93024e72628d4351c7ad19c0dfe635aa95.)

Change-Id: If1d61336edc7f63cdfd8ac14157376bde2651a31
Reviewed-on: https://boringssl-review.googlesource.com/13769
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 20:13:52 +00:00
+									# Some OSes (Windows) insist on stack being "wired" to
-												bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.

(Imports upstream's adc4f1fc25b2cac90076f1e1695b05b7aeeae501.)

Some OSes, *cough*-dows, insist on stack being "wired" to
physical memory in strictly sequential manner, i.e. if stack
allocation spans two pages, then reference to farmost one can
be punishable by SEGV. But page walking can do good even on
other OSes, because it guarantees that villain thread hits
the guard page before it can make damage to innocent one...

Change-Id: Ie1e278eb5982f26e596783b3d7820a71295688ec
Reviewed-on: https://boringssl-review.googlesource.com/13768
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 20:11:53 +00:00
+									# physical memory in strictly sequential manner, i.e. if stack
 									# allocation spans two pages, then reference to farmost one can
 									# be punishable by SEGV. But page walking can do good even on
 									# other OSes, because it guarantees that villain thread hits
 									# the guard page before it can make damage to innocent one...
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

(Imports upstream's 3ba1ef829cf3dd36eaa5e819258d90291c6a1027.)

Original strategy for page-walking was to adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior to adjusting stack pointer would
upset valgrind. As a compromise, let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Change-Id: Ife42935319de79c6c76f8df60a76204c546fd1e0
Reviewed-on: https://boringssl-review.googlesource.com/13775
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:29:19 +00:00
+									&mov	("eax","esp");
 									&sub	("eax","ebp");
-												bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.

(Imports upstream's adc4f1fc25b2cac90076f1e1695b05b7aeeae501.)

Some OSes, *cough*-dows, insist on stack being "wired" to
physical memory in strictly sequential manner, i.e. if stack
allocation spans two pages, then reference to farmost one can
be punishable by SEGV. But page walking can do good even on
other OSes, because it guarantees that villain thread hits
the guard page before it can make damage to innocent one...

Change-Id: Ie1e278eb5982f26e596783b3d7820a71295688ec
Reviewed-on: https://boringssl-review.googlesource.com/13768
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 20:11:53 +00:00
+									&and	("eax",-4096);
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

(Imports upstream's 3ba1ef829cf3dd36eaa5e819258d90291c6a1027.)

Original strategy for page-walking was to adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior to adjusting stack pointer would
upset valgrind. As a compromise, let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Change-Id: Ife42935319de79c6c76f8df60a76204c546fd1e0
Reviewed-on: https://boringssl-review.googlesource.com/13775
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:29:19 +00:00
+									&mov	("edx","esp");		# saved stack pointer!
 									&lea	("esp",&DWP(0,"ebp","eax"));
 									&mov	("eax",&DWP(0,"esp"));
 									&cmp	("esp","ebp");
 									&ja	(&label("page_walk"));
 									&jmp	(&label("page_walk_done"));
 								&set_label("page_walk",16);
 									&lea	("esp",&DWP(-4096,"esp"));
 									&mov	("eax",&DWP(0,"esp"));
 									&cmp	("esp","ebp");
 									&ja	(&label("page_walk"));
 								&set_label("page_walk_done");
-												bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.

(Imports upstream's adc4f1fc25b2cac90076f1e1695b05b7aeeae501.)

Some OSes, *cough*-dows, insist on stack being "wired" to
physical memory in strictly sequential manner, i.e. if stack
allocation spans two pages, then reference to farmost one can
be punishable by SEGV. But page walking can do good even on
other OSes, because it guarantees that villain thread hits
the guard page before it can make damage to innocent one...

Change-Id: Ie1e278eb5982f26e596783b3d7820a71295688ec
Reviewed-on: https://boringssl-review.googlesource.com/13768
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 20:11:53 +00:00
-												Initial import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
+									################################# load argument block...
 									&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
 									&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
 									&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

(Imports upstream's 3ba1ef829cf3dd36eaa5e819258d90291c6a1027.)

Original strategy for page-walking was to adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior to adjusting stack pointer would
upset valgrind. As a compromise, let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Change-Id: Ife42935319de79c6c76f8df60a76204c546fd1e0
Reviewed-on: https://boringssl-review.googlesource.com/13775
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:29:19 +00:00
+									&mov	("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
-												Initial import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
+									&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
 									#&mov	("edi",&DWP(5*4,"esi"));# int num
 									&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
 									&mov	($_rp,"eax");		# ... save a copy of argument block
 									&mov	($_ap,"ebx");
 									&mov	($_bp,"ecx");
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

(Imports upstream's 3ba1ef829cf3dd36eaa5e819258d90291c6a1027.)

Original strategy for page-walking was to adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior to adjusting stack pointer would
upset valgrind. As a compromise, let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Change-Id: Ife42935319de79c6c76f8df60a76204c546fd1e0
Reviewed-on: https://boringssl-review.googlesource.com/13775
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:29:19 +00:00
+									&mov	($_np,"ebp");
-												Initial import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
+									&mov	($_n0,"esi");
 									&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
 									#&mov	($_num,$num);		# redundant as $num is not reused
-												bn/asm/x86[_64]-mont*.pl: implement slightly alternative page-walking.

(Imports upstream's 3ba1ef829cf3dd36eaa5e819258d90291c6a1027.)

Original strategy for page-walking was to adjust stack pointer and then
touch pages in order. This kind of asks for double-fault, because
if touch fails, then signal will be delivered to frame above adjusted
stack pointer. But touching pages prior to adjusting stack pointer would
upset valgrind. As a compromise, let's adjust stack pointer in pages,
touching top of the stack. This still asks for double-fault, but at
least prevents corruption of neighbour stack if allocation is to
overstep the guard page.

Also omit predict-non-taken hints as they reportedly trigger illegal
instructions in some VM setups.

Change-Id: Ife42935319de79c6c76f8df60a76204c546fd1e0
Reviewed-on: https://boringssl-review.googlesource.com/13775
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:29:19 +00:00
+									&mov	($_sp,"edx");		# saved stack pointer!
-												Initial import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
 								if($sse2) {
 								$acc0="mm0";	# mmx register bank layout
 								$acc1="mm1";
 								$car0="mm2";
 								$car1="mm3";
 								$mul0="mm4";
 								$mul1="mm5";
 								$temp="mm6";
 								$mask="mm7";
 									&picmeup("eax","OPENSSL_ia32cap_P");
 									&bt	(&DWP(0,"eax"),26);
 									&jnc	(&label("non_sse2"));
 									&mov	("eax",-1);
 									&movd	($mask,"eax");		# mask 32 lower bits
 									&mov	($ap,$_ap);		# load input pointers
 									&mov	($bp,$_bp);
 									&mov	($np,$_np);
 									&xor	($i,$i);		# i=0
 									&xor	($j,$j);		# j=0
 									&movd	($mul0,&DWP(0,$bp));		# bp[0]
 									&movd	($mul1,&DWP(0,$ap));		# ap[0]
 									&movd	($car1,&DWP(0,$np));		# np[0]
 									&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
 									&movq	($car0,$mul1);
 									&movq	($acc0,$mul1);			# I wish movd worked for
 									&pand	($acc0,$mask);			# inter-register transfers
 									&pmuludq($mul1,$_n0q);			# *=n0
 									&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
 									&paddq	($car1,$acc0);
 									&movd	($acc1,&DWP(4,$np));		# np[1]
 									&movd	($acc0,&DWP(4,$ap));		# ap[1]
 									&psrlq	($car0,32);
 									&psrlq	($car1,32);
 									&inc	($j);				# j++
 								&set_label("1st",16);
 									&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
 									&pmuludq($acc1,$mul1);			# np[j]*m1
 									&paddq	($car0,$acc0);			# +=c0
 									&paddq	($car1,$acc1);			# +=c1
 									&movq	($acc0,$car0);
 									&pand	($acc0,$mask);
 									&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
 									&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
 									&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
 									&psrlq	($car0,32);
 									&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
 									&psrlq	($car1,32);
 									&lea	($j,&DWP(1,$j));
 									&cmp	($j,$num);
 									&jl	(&label("1st"));
 									&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
 									&pmuludq($acc1,$mul1);			# np[num-1]*m1
 									&paddq	($car0,$acc0);			# +=c0
 									&paddq	($car1,$acc1);			# +=c1
 									&movq	($acc0,$car0);
 									&pand	($acc0,$mask);
 									&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
 									&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
 									&psrlq	($car0,32);
 									&psrlq	($car1,32);
 									&paddq	($car1,$car0);
 									&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
 									&inc	($i);				# i++
 								&set_label("outer");
 									&xor	($j,$j);			# j=0
 									&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
 									&movd	($mul1,&DWP(0,$ap));		# ap[0]
 									&movd	($temp,&DWP($frame,"esp"));	# tp[0]
 									&movd	($car1,&DWP(0,$np));		# np[0]
 									&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
 									&paddq	($mul1,$temp);			# +=tp[0]
 									&movq	($acc0,$mul1);
 									&movq	($car0,$mul1);
 									&pand	($acc0,$mask);
 									&pmuludq($mul1,$_n0q);			# *=n0
 									&pmuludq($car1,$mul1);
 									&paddq	($car1,$acc0);
 									&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
 									&movd	($acc1,&DWP(4,$np));		# np[1]
 									&movd	($acc0,&DWP(4,$ap));		# ap[1]
 									&psrlq	($car0,32);
 									&psrlq	($car1,32);
 									&paddq	($car0,$temp);			# +=tp[1]
 									&inc	($j);				# j++
 									&dec	($num);
 								&set_label("inner");
 									&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
 									&pmuludq($acc1,$mul1);			# np[j]*m1
 									&paddq	($car0,$acc0);			# +=c0
 									&paddq	($car1,$acc1);			# +=c1
 									&movq	($acc0,$car0);
 									&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
 									&pand	($acc0,$mask);
 									&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
 									&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
 									&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
 									&psrlq	($car0,32);
 									&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
 									&psrlq	($car1,32);
 									&paddq	($car0,$temp);			# +=tp[j+1]
 									&dec	($num);
 									&lea	($j,&DWP(1,$j));		# j++
 									&jnz	(&label("inner"));
 									&mov	($num,$j);
 									&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
 									&pmuludq($acc1,$mul1);			# np[num-1]*m1
 									&paddq	($car0,$acc0);			# +=c0
 									&paddq	($car1,$acc1);			# +=c1
 									&movq	($acc0,$car0);
 									&pand	($acc0,$mask);
 									&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
 									&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
 									&psrlq	($car0,32);
 									&psrlq	($car1,32);
 									&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
 									&paddq	($car1,$car0);
 									&paddq	($car1,$temp);
 									&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
 									&lea	($i,&DWP(1,$i));		# i++
 									&cmp	($i,$num);
 									&jle	(&label("outer"));
 									&emms	();				# done with mmx bank
 									&jmp	(&label("common_tail"));
 								&set_label("non_sse2",16);
 								}
 								if (0) {
 									&mov	("esp",$_sp);
 									&xor	("eax","eax");	# signal "not fast enough [yet]"
 									&jmp	(&label("just_leave"));
 									# While the below code provides competitive performance for
-												Spelling fixes in Perl files.

(Imports upstream's 6025001707fd65679d758c877200469d4e72ea88.)

Change-Id: I2f237d675b029cfc7ba3640aa9ce7248cc230013
Reviewed-on: https://boringssl-review.googlesource.com/13773
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 20:29:26 +00:00
+									# all key lengths on modern Intel cores, it's still more
-												Initial import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
+									# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
 									# means compared to the original integer-only assembler.
 									# 512-bit RSA sign is better by ~40%, but that's about all
 									# one can say about all CPUs...
 								} else {
 								$inp="esi";	# integer path uses these registers differently
 								$word="edi";
 								$carry="ebp";
 									&mov	($inp,$_ap);
 									&lea	($carry,&DWP(1,$num));
 									&mov	($word,$_bp);
 									&xor	($j,$j);				# j=0
 									&mov	("edx",$inp);
 									&and	($carry,1);				# see if num is even
 									&sub	("edx",$word);				# see if ap==bp
 									&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
 									&or	($carry,"edx");
 									&mov	($word,&DWP(0,$word));			# bp[0]
 									&jz	(&label("bn_sqr_mont"));
 									&mov	($_bpend,"eax");
 									&mov	("eax",&DWP(0,$inp));
 									&xor	("edx","edx");
 								&set_label("mull",16);
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[j]*bp[0]
 									&add	($carry,"eax");
 									&lea	($j,&DWP(1,$j));
 									&adc	("edx",0);
 									&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
 									&cmp	($j,$num);
 									&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 									&jl	(&label("mull"));
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[num-1]*bp[0]
 									 &mov	($word,$_n0);
 									&add	("eax",$carry);
 									 &mov	($inp,$_np);
 									&adc	("edx",0);
 									 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
 									&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
 									&xor	($j,$j);
 									&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
 									&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
 									&mov	("eax",&DWP(0,$inp));			# np[0]
 									&mul	($word);				# np[0]*m
 									&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
 									&mov	("eax",&DWP(4,$inp));			# np[1]
 									&adc	("edx",0);
 									&inc	($j);
 									&jmp	(&label("2ndmadd"));
 								&set_label("1stmadd",16);
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[j]*bp[i]
 									&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
 									&lea	($j,&DWP(1,$j));
 									&adc	("edx",0);
 									&add	($carry,"eax");
 									&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
 									&adc	("edx",0);
 									&cmp	($j,$num);
 									&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 									&jl	(&label("1stmadd"));
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[num-1]*bp[i]
 									&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
 									 &mov	($word,$_n0);
 									&adc	("edx",0);
 									 &mov	($inp,$_np);
 									&add	($carry,"eax");
 									&adc	("edx",0);
 									 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
 									&xor	($j,$j);
 									&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
 									&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
 									&adc	($j,0);
 									 &mov	("eax",&DWP(0,$inp));			# np[0]
 									&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
 									&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
 									&mul	($word);				# np[0]*m
 									&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
 									&mov	("eax",&DWP(4,$inp));			# np[1]
 									&adc	("edx",0);
 									&mov	($j,1);
 								&set_label("2ndmadd",16);
 									&mov	($carry,"edx");
 									&mul	($word);				# np[j]*m
 									&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
 									&lea	($j,&DWP(1,$j));
 									&adc	("edx",0);
 									&add	($carry,"eax");
 									&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
 									&adc	("edx",0);
 									&cmp	($j,$num);
 									&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
 									&jl	(&label("2ndmadd"));
 									&mov	($carry,"edx");
 									&mul	($word);				# np[j]*m
 									&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
 									&adc	("edx",0);
 									&add	($carry,"eax");
 									&adc	("edx",0);
 									&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
 									&xor	("eax","eax");
 									 &mov	($j,$_bp);				# &bp[i]
 									&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
 									&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
 									 &lea	($j,&DWP(4,$j));
 									&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
 									 &cmp	($j,$_bpend);
 									&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
 									&je	(&label("common_tail"));
 									&mov	($word,&DWP(0,$j));			# bp[i+1]
 									&mov	($inp,$_ap);
 									&mov	($_bp,$j);				# &bp[++i]
 									&xor	($j,$j);
 									&xor	("edx","edx");
 									&mov	("eax",&DWP(0,$inp));
 									&jmp	(&label("1stmadd"));
 								&set_label("bn_sqr_mont",16);
 								$sbit=$num;
 									&mov	($_num,$num);
 									&mov	($_bp,$j);				# i=0
 									&mov	("eax",$word);				# ap[0]
 									&mul	($word);				# ap[0]*ap[0]
 									&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
 									&mov	($sbit,"edx");
 									&shr	("edx",1);
 									&and	($sbit,1);
 									&inc	($j);
 								&set_label("sqr",16);
 									&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[j]*ap[0]
 									&add	("eax",$carry);
 									&lea	($j,&DWP(1,$j));
 									&adc	("edx",0);
 									&lea	($carry,&DWP(0,$sbit,"eax",2));
 									&shr	("eax",31);
 									&cmp	($j,$_num);
 									&mov	($sbit,"eax");
 									&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 									&jl	(&label("sqr"));
 									&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[num-1]*ap[0]
 									&add	("eax",$carry);
 									 &mov	($word,$_n0);
 									&adc	("edx",0);
 									 &mov	($inp,$_np);
 									&lea	($carry,&DWP(0,$sbit,"eax",2));
 									 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
 									&shr	("eax",31);
 									&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
 									&lea	($carry,&DWP(0,"eax","edx",2));
 									 &mov	("eax",&DWP(0,$inp));			# np[0]
 									&shr	("edx",31);
 									&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
 									&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
 									&mul	($word);				# np[0]*m
 									&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
 									&mov	($num,$j);
 									&adc	("edx",0);
 									&mov	("eax",&DWP(4,$inp));			# np[1]
 									&mov	($j,1);
 								&set_label("3rdmadd",16);
 									&mov	($carry,"edx");
 									&mul	($word);				# np[j]*m
 									&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
 									&adc	("edx",0);
 									&add	($carry,"eax");
 									&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
 									&adc	("edx",0);
 									&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
 									&mov	($carry,"edx");
 									&mul	($word);				# np[j+1]*m
 									&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
 									&lea	($j,&DWP(2,$j));
 									&adc	("edx",0);
 									&add	($carry,"eax");
 									&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
 									&adc	("edx",0);
 									&cmp	($j,$num);
 									&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
 									&jl	(&label("3rdmadd"));
 									&mov	($carry,"edx");
 									&mul	($word);				# np[j]*m
 									&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
 									&adc	("edx",0);
 									&add	($carry,"eax");
 									&adc	("edx",0);
 									&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
 									&mov	($j,$_bp);				# i
 									&xor	("eax","eax");
 									&mov	($inp,$_ap);
 									&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
 									&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
 									&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
 									&cmp	($j,$num);
 									&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
 									&je	(&label("common_tail"));
 									&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
 									&lea	($j,&DWP(1,$j));
 									&mov	("eax",$word);
 									&mov	($_bp,$j);				# ++i
 									&mul	($word);				# ap[i]*ap[i]
 									&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
 									&adc	("edx",0);
 									&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
 									&xor	($carry,$carry);
 									&cmp	($j,$num);
 									&lea	($j,&DWP(1,$j));
 									&je	(&label("sqrlast"));
 									&mov	($sbit,"edx");				# zaps $num
 									&shr	("edx",1);
 									&and	($sbit,1);
 								&set_label("sqradd",16);
 									&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
 									&mov	($carry,"edx");
 									&mul	($word);				# ap[j]*ap[i]
 									&add	("eax",$carry);
 									&lea	($carry,&DWP(0,"eax","eax"));
 									&adc	("edx",0);
 									&shr	("eax",31);
 									&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
 									&lea	($j,&DWP(1,$j));
 									&adc	("eax",0);
 									&add	($carry,$sbit);
 									&adc	("eax",0);
 									&cmp	($j,$_num);
 									&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
 									&mov	($sbit,"eax");
 									&jle	(&label("sqradd"));
 									&mov	($carry,"edx");
 									&add	("edx","edx");
 									&shr	($carry,31);
 									&add	("edx",$sbit);
 									&adc	($carry,0);
 								&set_label("sqrlast");
 									&mov	($word,$_n0);
 									&mov	($inp,$_np);
 									&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
 									&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
 									&mov	("eax",&DWP(0,$inp));			# np[0]
 									&adc	($carry,0);
 									&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
 									&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
 									&mul	($word);				# np[0]*m
 									&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
 									&lea	($num,&DWP(-1,$j));
 									&adc	("edx",0);
 									&mov	($j,1);
 									&mov	("eax",&DWP(4,$inp));			# np[1]
 									&jmp	(&label("3rdmadd"));
 								}
 								&set_label("common_tail",16);
 									&mov	($np,$_np);			# load modulus pointer
 									&mov	($rp,$_rp);			# load result pointer
 									&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
 									&mov	("eax",&DWP(0,$tp));		# tp[0]
 									&mov	($j,$num);			# j=num-1
 									&xor	($i,$i);			# i=0 and clear CF!
 								&set_label("sub",16);
 									&sbb	("eax",&DWP(0,$np,$i,4));
 									&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
 									&dec	($j);				# doesn't affect CF!
 									&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
 									&lea	($i,&DWP(1,$i));		# i++
 									&jge	(&label("sub"));
 									&sbb	("eax",0);			# handle upmost overflow bit
-												Revert "OpenSSL: make final reduction in Montgomery multiplication constant-time."

This reverts commit 75b833cc819a9d189adb0fdd56327bee600ff9e9.

Sadly this needs to be redone because upstream never took this change.
Perhaps, once redone, we can try upstreaming it again.

Change-Id: Ic8aaa0728a43936cde1628ca031ff3821f0fbf5b
Reviewed-on: https://boringssl-review.googlesource.com/13776
Commit-Queue: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:48:10 +00:00
+									&and	($tp,"eax");
 									&not	("eax");
 									&mov	($np,$rp);
 									&and	($np,"eax");
 									&or	($tp,$np);			# tp=carry?tp:rp
-												Initial import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
 								&set_label("copy",16);				# copy or in-place refresh
-												Revert "OpenSSL: make final reduction in Montgomery multiplication constant-time."

This reverts commit 75b833cc819a9d189adb0fdd56327bee600ff9e9.

Sadly this needs to be redone because upstream never took this change.
Perhaps, once redone, we can try upstreaming it again.

Change-Id: Ic8aaa0728a43936cde1628ca031ff3821f0fbf5b
Reviewed-on: https://boringssl-review.googlesource.com/13776
Commit-Queue: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>

											
										
										
											2017-02-09 21:48:10 +00:00
+									&mov	("eax",&DWP(0,$tp,$num,4));
 									&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
 									&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
-												Initial import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (1.0.2 beta).

(This change contains substantial changes from the original and
effectively starts a new history.)

											
										
										
											2014-06-20 20:00:00 +01:00
+									&dec	($num);
 									&jge	(&label("copy"));
 									&mov	("esp",$_sp);		# pull saved stack pointer
 									&mov	("eax",1);
 								&set_label("just_leave");
 								&function_end("bn_mul_mont");
 								# Attribution banner handed to perlasm's asciz helper (presumably
 								# emitted as a NUL-terminated string in the generated assembly --
 								# confirm against x86asm.pl), followed by the perlasm finalizer.
 								my $banner = "Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>";
 								&asciz($banner);
 								&asm_finish();
-												Switch perlasm calling convention.

Depending on architecture, perlasm differed on which one or both of:

  perl foo.pl flavor output.S
  perl foo.pl flavor > output.S

Upstream has now unified on the first form after making a number of
changes to their files (the second does not even work for their x86
files anymore). Sync those portions of our perlasm scripts with upstream
and update CMakeLists.txt and generate_build_files.py per the new
convention.

This imports various commits like this one:
184bc45f683c76531d7e065b6553ca9086564576 (this was done by taking a
diff, so I don't have the full list)

Confirmed that generate_build_files.py sees no change.

BUG=14

Change-Id: Id2fb5b8bc2a7369d077221b5df9a6947d41f50d2
Reviewed-on: https://boringssl-review.googlesource.com/8518
Reviewed-by: Adam Langley <agl@google.com>

											
										
										
											2016-06-26 18:18:50 +01:00
 								# STDOUT was redirected to the output file at the top of the script.
 								# Closing it flushes the buffered assembly; if that flush fails the
 								# script must abort rather than exit successfully with a truncated
 								# or empty output file.
 								close STDOUT or die "error closing STDOUT: $!";