boringssl/crypto/bn/asm/x86-mont.pl
David Benjamin fdd8e9c8c7 Switch perlasm calling convention.
Depending on architecture, perlasm differed on which one or both of:

  perl foo.pl flavor output.S
  perl foo.pl flavor > output.S

Upstream has now unified on the first form after making a number of
changes to their files (the second does not even work for their x86
files anymore). Sync those portions of our perlasm scripts with upstream
and update CMakeLists.txt and generate_build_files.py per the new
convention.

This imports various commits like this one:
184bc45f683c76531d7e065b6553ca9086564576 (this was done by taking a
diff, so I don't have the full list)

Confirmed that generate_build_files.py sees no change.

BUG=14

Change-Id: Id2fb5b8bc2a7369d077221b5df9a6947d41f50d2
Reviewed-on: https://boringssl-review.googlesource.com/8518
Reviewed-by: Adam Langley <agl@google.com>
2016-06-27 21:59:26 +00:00

598 lines
16 KiB
Raku
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop;
open STDOUT,">$output";
&asm_init($ARGV[0],$0);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&function_begin("bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&cmp ("edi",4);
&jl (&label("just_leave"));
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&mov ("ebp","esp"); # saved stack pointer!
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arraning 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
&mov ("eax","esp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("esp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","esp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("esp","edx"); # this splits them apart modulo 4096
&and ("esp",-64); # align to cache line
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"edx");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"ebp"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&picmeup("eax","OPENSSL_ia32cap_P");
&bt (&DWP(0,"eax"),26);
&jnc (&label("non_sse2"));
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
&set_label("non_sse2",16);
}
if (0) {
&mov ("esp",$_sp);
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
# While the below code provides competitive performance for
# all key lengthes on modern Intel cores, it's still more
# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
# means compared to the original integer-only assembler.
# 512-bit RSA sign is better by ~40%, but that's about all
# one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
$carry="ebp";
&mov ($inp,$_ap);
&lea ($carry,&DWP(1,$num));
&mov ($word,$_bp);
&xor ($j,$j); # j=0
&mov ("edx",$inp);
&and ($carry,1); # see if num is even
&sub ("edx",$word); # see if ap==bp
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
&or ($carry,"edx");
&mov ($word,&DWP(0,$word)); # bp[0]
&jz (&label("bn_sqr_mont"));
&mov ($_bpend,"eax");
&mov ("eax",&DWP(0,$inp));
&xor ("edx","edx");
&set_label("mull",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[0]
&add ($carry,"eax");
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("mull"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[0]
&mov ($word,$_n0);
&add ("eax",$carry);
&mov ($inp,$_np);
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
&xor ($j,$j);
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mov ("eax",&DWP(0,$inp)); # np[0]
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&inc ($j);
&jmp (&label("2ndmadd"));
&set_label("1stmadd",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[i]
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("1stmadd"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[i]
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&add ($carry,"eax");
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&xor ($j,$j);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
&adc ($j,0);
&mov ("eax",&DWP(0,$inp)); # np[0]
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&mov ($j,1);
&set_label("2ndmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
&jl (&label("2ndmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&xor ("eax","eax");
&mov ($j,$_bp); # &bp[i]
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&lea ($j,&DWP(4,$j));
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$_bpend);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(0,$j)); # bp[i+1]
&mov ($inp,$_ap);
&mov ($_bp,$j); # &bp[++i]
&xor ($j,$j);
&xor ("edx","edx");
&mov ("eax",&DWP(0,$inp));
&jmp (&label("1stmadd"));
&set_label("bn_sqr_mont",16);
$sbit=$num;
&mov ($_num,$num);
&mov ($_bp,$j); # i=0
&mov ("eax",$word); # ap[0]
&mul ($word); # ap[0]*ap[0]
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
&mov ($sbit,"edx");
&shr ("edx",1);
&and ($sbit,1);
&inc ($j);
&set_label("sqr",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[0]
&add ("eax",$carry);
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&shr ("eax",31);
&cmp ($j,$_num);
&mov ($sbit,"eax");
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("sqr"));
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*ap[0]
&add ("eax",$carry);
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&shr ("eax",31);
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
&lea ($carry,&DWP(0,"eax","edx",2));
&mov ("eax",&DWP(0,$inp)); # np[0]
&shr ("edx",31);
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ($num,$j);
&adc ("edx",0);
&mov ("eax",&DWP(4,$inp)); # np[1]
&mov ($j,1);
&set_label("3rdmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
&mov ($carry,"edx");
&mul ($word); # np[j+1]*m
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
&lea ($j,&DWP(2,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
&jl (&label("3rdmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&mov ($j,$_bp); # i
&xor ("eax","eax");
&mov ($inp,$_ap);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$num);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
&lea ($j,&DWP(1,$j));
&mov ("eax",$word);
&mov ($_bp,$j); # ++i
&mul ($word); # ap[i]*ap[i]
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
&adc ("edx",0);
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
&xor ($carry,$carry);
&cmp ($j,$num);
&lea ($j,&DWP(1,$j));
&je (&label("sqrlast"));
&mov ($sbit,"edx"); # zaps $num
&shr ("edx",1);
&and ($sbit,1);
&set_label("sqradd",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[i]
&add ("eax",$carry);
&lea ($carry,&DWP(0,"eax","eax"));
&adc ("edx",0);
&shr ("eax",31);
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("eax",0);
&add ($carry,$sbit);
&adc ("eax",0);
&cmp ($j,$_num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&mov ($sbit,"eax");
&jle (&label("sqradd"));
&mov ($carry,"edx");
&add ("edx","edx");
&shr ($carry,31);
&add ("edx",$sbit);
&adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
&mov ("eax",&DWP(0,$inp)); # np[0]
&adc ($carry,0);
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&lea ($num,&DWP(-1,$j));
&adc ("edx",0);
&mov ($j,1);
&mov ("eax",&DWP(4,$inp)); # np[1]
&jmp (&label("3rdmadd"));
}
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
&set_label("copy",16); # copy or in-place refresh
&mov ("edx",&DWP(0,$tp,$num,4));
&mov ($np,&DWP(0,$rp,$num,4));
&xor ("edx",$np); # conditional select
&and ("edx","eax");
&xor ("edx",$np);
&mov (&DWP(0,$tp,$num,4),$j) # zap temporary vector
&mov (&DWP(0,$rp,$num,4),"edx"); # rp[i]=tp[i]
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT;