#! /usr/bin/env perl
|
|
#
|
|
# April 2019
|
|
#
|
|
# Abstract: field arithmetic in x64 assembly for SIDH/p503
|
|
|
|
$flavour = shift;
|
|
$output = shift;
|
|
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
|
|
|
# OZAPTF: this Win64 flavour detection may be needed
|
|
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
|
|
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
|
( $xlate="${dir}../../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
|
|
die "can't locate x86_64-xlate.pl";
|
|
|
|
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
|
|
*STDOUT=*OUT;
|
|
|
|
$PREFIX="sike";
|
|
$addx = 1;
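# $addx gates generation of the MULX/ADCX/ADOX code paths below; the emitted
# _mpmul and _fprdc still dispatch on OPENSSL_ia32cap_P at run time.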
|
|
|
|
# Conditionally swaps the 16 bytes at $idx*16(%rdi) and $idx*16(%rsi)
# in constant time, using the swap mask in %xmm15
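# (Masked-swap idiom: t = (a ^ b) & mask; a ^= t; b ^= t. The pair is
# exchanged iff mask is all ones and left untouched iff it is all zeros.)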
|
|
sub CSWAP16() {
|
|
my $idx = shift;
|
|
|
|
$code.=<<___;
|
|
movdqu $idx*16(%rdi), %xmm0
|
|
movdqu $idx*16(%rsi), %xmm1
|
|
movdqa %xmm1, %xmm2
|
|
pxor %xmm0, %xmm2
|
|
pand %xmm15,%xmm2
|
|
pxor %xmm2, %xmm0
|
|
pxor %xmm2, %xmm1
|
|
movdqu %xmm0, $idx*16(%rdi)
|
|
movdqu %xmm1, $idx*16(%rsi)
|
|
___
|
|
}
|
|
|
|
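# Schoolbook multiplication of two 256-bit operands (4 limbs each):
# the 64-byte product at idxC(C) <- 32 bytes at idxM0(M0) * 32 bytes at idxM1(M1).
# Uses MULX/ADCX/ADOX; T0-T9 name caller-provided scratch registers.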
sub MUL256_SCHOOL() {
|
|
my $idxM0 = shift;
|
|
my $M0 = shift;
|
|
my $idxM1 = shift;
|
|
my $M1 = shift;
|
|
my $idxC = shift;
|
|
my $C = shift;
|
|
my $T0 = shift;
|
|
my $T1 = shift;
|
|
my $T2 = shift;
|
|
my $T3 = shift;
|
|
my $T4 = shift;
|
|
my $T5 = shift;
|
|
my $T6 = shift;
|
|
my $T7 = shift;
|
|
my $T8 = shift;
|
|
my $T9 = shift;
|
|
|
|
# OZAPTF: Is this the best approach?
|
|
my $idxC_0 = $idxC;
|
|
my $idxC_8 = $idxC + 8;
|
|
my $idxC_16 = $idxC + 16;
|
|
my $idxC_24 = $idxC + 24;
|
|
my $idxC_32 = $idxC + 32;
|
|
my $idxC_40 = $idxC + 40;
|
|
my $idxC_48 = $idxC + 48;
|
|
my $idxC_56 = $idxC + 56;
|
|
|
|
$code.=<<___;
|
|
mov ($idxM0+0)(%$M0), %rdx
|
|
mulx ($idxM1+0)(%$M1), %$T1, %$T0 # T0:T1 = A0*B0
|
|
mov %$T1, $idxC_0(%$C) # C0_final
|
|
mulx ($idxM1+8)(%$M1), %$T2, %$T1 # T1:T2 = A0*B1
|
|
xor %rax, %rax
|
|
adox %$T2, %$T0
|
|
mulx ($idxM1+16)(%$M1), %$T3, %$T2 # T2:T3 = A0*B2
|
|
adox %$T3, %$T1
|
|
mulx ($idxM1+24)(%$M1), %$T4, %$T3 # T3:T4 = A0*B3
|
|
adox %$T4, %$T2
|
|
|
|
mov ($idxM0+8)(%$M0), %rdx
|
|
mulx ($idxM1+0)(%$M1), %$T4, %$T5 # T5:T4 = A1*B0
|
|
adox %rax, %$T3
|
|
xor %rax, %rax
|
|
mulx ($idxM1+8)(%$M1), %$T7, %$T6 # T6:T7 = A1*B1
|
|
adox %$T0, %$T4
|
|
mov %$T4, $idxC_8(%$C) # C1_final
|
|
adcx %$T7, %$T5
|
|
mulx ($idxM1+16)(%$M1), %$T8, %$T7 # T7:T8 = A1*B2
|
|
adcx %$T8, %$T6
|
|
adox %$T1, %$T5
|
|
mulx ($idxM1+24)(%$M1), %$T9, %$T8 # T8:T9 = A1*B3
|
|
adcx %$T9, %$T7
|
|
adcx %rax, %$T8
|
|
adox %$T2, %$T6
|
|
|
|
mov ($idxM0+16)(%$M0), %rdx
|
|
mulx ($idxM1+ 0)(%$M1), %$T0, %$T1 # T1:T0 = A2*B0
|
|
adox %$T3, %$T7
|
|
adox %rax, %$T8
|
|
xor %rax, %rax
|
|
mulx ($idxM1+ 8)(%$M1), %$T3, %$T2 # T2:T3 = A2*B1
|
|
adox %$T5, %$T0
|
|
mov %$T0, $idxC_16(%$C) # C2_final
|
|
adcx %$T3, %$T1
|
|
mulx ($idxM1+16)(%$M1), %$T4, %$T3 # T3:T4 = A2*B2
|
|
adcx %$T4, %$T2
|
|
adox %$T6, %$T1
|
|
mulx ($idxM1+24)(%$M1), %$T9, %$T4 # T4:T9 = A2*B3
|
|
adcx %$T9, %$T3
|
|
mov ($idxM0+24)(%$M0), %rdx
|
|
adcx %rax, %$T4
|
|
|
|
adox %$T7, %$T2
|
|
adox %$T8, %$T3
|
|
adox %rax, %$T4
|
|
|
|
mulx ($idxM1+ 0)(%$M1), %$T0, %$T5 # T5:T0 = A3*B0
|
|
xor %rax, %rax
|
|
mulx ($idxM1+ 8)(%$M1), %$T7, %$T6 # T6:T7 = A3*B1
|
|
adcx %$T7, %$T5
|
|
adox %$T0, %$T1
|
|
mulx ($idxM1+16)(%$M1), %$T8, %$T7 # T7:T8 = A3*B2
|
|
adcx %$T8, %$T6
|
|
adox %$T5, %$T2
|
|
mulx ($idxM1+24)(%$M1), %$T9, %$T8 # T8:T9 = A3*B3
|
|
adcx %$T9, %$T7
|
|
adcx %rax, %$T8
|
|
adox %$T6, %$T3
|
|
adox %$T7, %$T4
|
|
adox %rax, %$T8
|
|
mov %$T1, $idxC_24(%$C) # C3_final
|
|
mov %$T2, $idxC_32(%$C) # C4_final
|
|
mov %$T3, $idxC_40(%$C) # C5_final
|
|
mov %$T4, $idxC_48(%$C) # C6_final
|
|
mov %$T8, $idxC_56(%$C) # C7_final
|
|
___
|
|
}
|
|
|
|
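# Schoolbook multiplication of a 128-bit operand (2 limbs at idxM0(M0)) by a
# 320-bit constant (5 limbs at M1). The 7-limb product is left in T0-T6;
# T7-T9, rax and rdx are used as scratch.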
sub MUL128x320_SCHOOL() {
|
|
my $idxM0 = shift;
|
|
my $M0 = shift;
|
|
my $M1 = shift;
|
|
my $T0 = shift;
|
|
my $T1 = shift;
|
|
my $T2 = shift;
|
|
my $T3 = shift;
|
|
my $T4 = shift;
|
|
my $T5 = shift;
|
|
my $T6 = shift;
|
|
my $T7 = shift;
|
|
my $T8 = shift;
|
|
my $T9 = shift;
|
|
$code.=<<___;
|
|
mov $idxM0(%$M0), %rdx
|
|
mulx 0+$M1, %$T0, %$T1 # T0 <- C0_final
|
|
mulx 8+$M1, %$T4, %$T2
|
|
|
|
xor %rax, %rax
|
|
mulx 16+$M1, %$T5, %$T3
|
|
adox %$T4, %$T1
|
|
adox %$T5, %$T2
|
|
mulx 24+$M1, %$T7, %$T4
|
|
adox %$T7, %$T3
|
|
mulx 32+$M1, %$T6, %$T5
|
|
adox %$T6, %$T4
|
|
adox %rax, %$T5
|
|
|
|
mov ($idxM0+8)(%$M0), %rdx
|
|
mulx 0+$M1, %$T6, %$T7
|
|
adcx %$T6, %$T1 # T1 <- C1_final
|
|
adcx %$T7, %$T2
|
|
mulx 8+$M1, %$T8, %$T6
|
|
adcx %$T6, %$T3
|
|
mulx 16+$M1, %$T7, %$T9
|
|
adcx %$T9, %$T4
|
|
mulx 24+$M1, %$T9, %$T6
|
|
adcx %$T6, %$T5
|
|
mulx 32+$M1, %rdx, %$T6
|
|
adcx %rax, %$T6
|
|
|
|
xor %rax, %rax
|
|
adox %$T8, %$T2
|
|
adox %$T7, %$T3
|
|
adox %$T9, %$T4
|
|
adox %rdx, %$T5
|
|
adox %rax, %$T6
|
|
|
|
___
|
|
}
|
|
|
|
$code.=<<___;
|
|
.section .rodata
|
|
|
|
// p503 x 2
|
|
p503x2:
|
|
.quad 0xFFFFFFFFFFFFFFFE
|
|
.quad 0xFFFFFFFFFFFFFFFF
|
|
.quad 0x57FFFFFFFFFFFFFF
|
|
.quad 0x2610B7B44423CF41
|
|
.quad 0x3737ED90F6FCFB5E
|
|
.quad 0xC08B8D7BB4EF49A0
|
|
.quad 0x0080CDEA83023C3C
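// (limb 2 of 2*p503 equals limb 1, so only seven quadwords are stored;
//  the code reuses the entry at offset 8 for both limbs)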
|
|
|
|
// p503 + 1
|
|
p503p1:
|
|
.quad 0xAC00000000000000
|
|
.quad 0x13085BDA2211E7A0
|
|
.quad 0x1B9BF6C87B7E7DAF
|
|
.quad 0x6045C6BDDA77A4D0
|
|
.quad 0x004066F541811E1E
|
|
|
|
p503p1_nz:
|
|
.quad 0xAC00000000000000
|
|
.quad 0x13085BDA2211E7A0
|
|
.quad 0x1B9BF6C87B7E7DAF
|
|
.quad 0x6045C6BDDA77A4D0
|
|
.quad 0x004066F541811E1E
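// Same five non-zero limbs as p503p1 (the three least-significant limbs of
// p503+1 are zero); this copy is the one referenced by the MULX reduction.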
|
|
|
|
.section .text
|
|
___
|
|
|
|
$code.=<<___;
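###########################################
# Field addition
# Operation: c [rdx] = a [rdi] + b [rsi]
# The sum is reduced modulo 2*p503 (via the p503x2 constant).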
|
|
.globl ${PREFIX}_fpadd
|
|
.type ${PREFIX}_fpadd,\@function,3
|
|
${PREFIX}_fpadd:
|
|
.cfi_startproc
|
|
push %r12
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r12, -16
|
|
push %r13
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r13, -24
|
|
push %r14
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r14, -32
|
|
push %r15
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r15, -40
|
|
push %rbx
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset rbx, -48
|
|
|
|
xor %rax, %rax
|
|
|
|
mov 0x0(%rdi), %r8
|
|
mov 0x8(%rdi), %r9
|
|
mov 0x10(%rdi), %r10
|
|
mov 0x18(%rdi), %r11
|
|
mov 0x20(%rdi), %r12
|
|
mov 0x28(%rdi), %r13
|
|
mov 0x30(%rdi), %r14
|
|
mov 0x38(%rdi), %r15
|
|
|
|
add 0x0(%rsi), %r8
|
|
adc 0x8(%rsi), %r9
|
|
adc 0x10(%rsi), %r10
|
|
adc 0x18(%rsi), %r11
|
|
adc 0x20(%rsi), %r12
|
|
adc 0x28(%rsi), %r13
|
|
adc 0x30(%rsi), %r14
|
|
adc 0x38(%rsi), %r15
|
|
|
|
lea p503x2(%rip), %rbx
|
|
|
|
mov 0(%rbx), %rcx;
|
|
sub %rcx, %r8
|
|
mov 8(%rbx), %rcx;
|
|
sbb %rcx, %r9
|
|
sbb %rcx, %r10
|
|
mov 16(%rbx), %rcx;
|
|
sbb %rcx, %r11
|
|
mov 24(%rbx), %rcx;
|
|
sbb %rcx, %r12
|
|
mov 32(%rbx), %rcx;
|
|
sbb %rcx, %r13
|
|
mov 40(%rbx), %rcx;
|
|
sbb %rcx, %r14
|
|
mov 48(%rbx), %rcx;
|
|
sbb %rcx, %r15
|
|
sbb \$0, %rax
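# rax is now 0 if no borrow occurred and all ones otherwise; 2*p503,
# masked with rax, is added back below to keep the reduction constant time.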
|
|
|
|
mov 0(%rbx), %rdi
|
|
and %rax, %rdi
|
|
mov 8(%rbx), %rsi
|
|
and %rax, %rsi
|
|
mov 16(%rbx), %rcx
|
|
and %rax, %rcx
|
|
|
|
add %rdi, %r8
|
|
mov %r8, 0x0(%rdx)
|
|
adc %rsi, %r9
|
|
mov %r9, 0x8(%rdx)
|
|
adc %rsi, %r10
|
|
mov %r10, 0x10(%rdx)
|
|
adc %rcx, %r11
|
|
mov %r11, 0x18(%rdx)
|
|
|
|
setc %cl
|
|
|
|
mov 24(%rbx), %r8
|
|
and %rax, %r8
|
|
mov 32(%rbx), %r9
|
|
and %rax, %r9
|
|
mov 40(%rbx), %r10
|
|
and %rax, %r10
|
|
mov 48(%rbx), %r11
|
|
and %rax, %r11
|
|
|
|
bt \$0, %rcx
|
|
|
|
adc %r8, %r12
|
|
mov %r12, 0x20(%rdx)
|
|
adc %r9, %r13
|
|
mov %r13, 0x28(%rdx)
|
|
adc %r10, %r14
|
|
mov %r14, 0x30(%rdx)
|
|
adc %r11, %r15
|
|
mov %r15, 0x38(%rdx)
|
|
|
|
pop %rbx
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r15
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r14
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r13
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r12
|
|
.cfi_adjust_cfa_offset -8
|
|
ret
|
|
.cfi_endproc
|
|
___
|
|
|
|
$code.=<<___;
|
|
.globl ${PREFIX}_cswap_asm
|
|
.type ${PREFIX}_cswap_asm,\@function,3
|
|
${PREFIX}_cswap_asm:
|
|
# Fill xmm15. After this step the low 64 bits of xmm15
# hold RDX and the high 64 bits are zero.
mov %rdx, %xmm15

# Copy the low quadword into the high one, so that
# xmm15 = RDX:RDX. As RDX is either all ones or all
# zeros, xmm15 ends up with either all bits set or
# none of them. 68 = 01000100b
pshufd \$68, %xmm15, %xmm15
|
|
___
|
|
|
|
foreach my $i ( 0.. 3){&CSWAP16($i);} # P[0].X with Q[0].X
|
|
foreach my $i ( 4.. 7){&CSWAP16($i);} # P[0].Z with Q[0].Z
|
|
foreach my $i ( 8..11){&CSWAP16($i);} # P[1].X with Q[1].X
|
|
foreach my $i (12..15){&CSWAP16($i);} # P[1].Z with Q[1].Z
|
|
|
|
$code.=<<___;
|
|
ret
|
|
|
|
###########################################
|
|
# Field subtraction
|
|
# Operation: c [rdx] = a [rdi] - b [rsi]
|
|
|
|
.globl ${PREFIX}_fpsub
|
|
.type ${PREFIX}_fpsub,\@function,3
|
|
${PREFIX}_fpsub:
|
|
.cfi_startproc
|
|
push %r12
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r12, -16
|
|
push %r13
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r13, -24
|
|
push %r14
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r14, -32
|
|
push %r15
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r15, -40
|
|
push %rbx
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset rbx, -48
|
|
|
|
xor %rax, %rax
|
|
|
|
mov 0x0(%rdi), %r8
|
|
mov 0x8(%rdi), %r9
|
|
mov 0x10(%rdi), %r10
|
|
mov 0x18(%rdi), %r11
|
|
mov 0x20(%rdi), %r12
|
|
mov 0x28(%rdi), %r13
|
|
mov 0x30(%rdi), %r14
|
|
mov 0x38(%rdi), %r15
|
|
|
|
sub 0x0(%rsi), %r8
|
|
sbb 0x8(%rsi), %r9
|
|
sbb 0x10(%rsi), %r10
|
|
sbb 0x18(%rsi), %r11
|
|
sbb 0x20(%rsi), %r12
|
|
sbb 0x28(%rsi), %r13
|
|
sbb 0x30(%rsi), %r14
|
|
sbb 0x38(%rsi), %r15
|
|
sbb \$0x0, %rax
|
|
|
|
lea p503x2(%rip), %rbx
|
|
|
|
mov 0x0(%rbx), %rdi
|
|
and %rax, %rdi
|
|
mov 0x8(%rbx), %rsi
|
|
and %rax, %rsi
|
|
mov 0x10(%rbx), %rcx
|
|
and %rax, %rcx
|
|
|
|
add %rdi, %r8
|
|
adc %rsi, %r9
|
|
adc %rsi, %r10
|
|
adc %rcx, %r11
|
|
mov %r8, 0x0(%rdx)
|
|
mov %r9, 0x8(%rdx)
|
|
mov %r10, 0x10(%rdx)
|
|
mov %r11, 0x18(%rdx)
|
|
|
|
setc %cl
|
|
|
|
mov 0x18(%rbx), %r8
|
|
and %rax, %r8
|
|
mov 0x20(%rbx), %r9
|
|
and %rax, %r9
|
|
mov 0x28(%rbx), %r10
|
|
and %rax, %r10
|
|
mov 0x30(%rbx), %r11
|
|
and %rax, %r11
|
|
|
|
bt \$0x0, %rcx
|
|
|
|
adc %r8, %r12
|
|
adc %r9, %r13
|
|
adc %r10, %r14
|
|
adc %r11, %r15
|
|
mov %r12, 0x20(%rdx)
|
|
mov %r13, 0x28(%rdx)
|
|
mov %r14, 0x30(%rdx)
|
|
mov %r15, 0x38(%rdx)
|
|
|
|
pop %rbx
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r15
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r14
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r13
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r12
|
|
.cfi_adjust_cfa_offset -8
|
|
ret
|
|
.cfi_endproc
|
|
|
|
###########################################
|
|
# 503-bit multiprecision addition
|
|
# Operation: c [rdx] = a [rdi] + b [rsi]
|
|
|
|
.globl ${PREFIX}_mpadd_asm
|
|
.type ${PREFIX}_mpadd_asm,\@function,3
|
|
${PREFIX}_mpadd_asm:
|
|
.cfi_startproc
|
|
mov 0x0(%rdi), %r8
|
|
mov 0x8(%rdi), %r9
|
|
mov 0x10(%rdi), %r10
|
|
mov 0x18(%rdi), %r11
|
|
add 0x0(%rsi), %r8
|
|
adc 0x8(%rsi), %r9
|
|
adc 0x10(%rsi), %r10
|
|
adc 0x18(%rsi), %r11
|
|
mov %r8, 0x0(%rdx)
|
|
mov %r9, 0x8(%rdx)
|
|
mov %r10, 0x10(%rdx)
|
|
mov %r11, 0x18(%rdx)
|
|
|
|
mov 0x20(%rdi), %r8
|
|
mov 0x28(%rdi), %r9
|
|
mov 0x30(%rdi), %r10
|
|
mov 0x38(%rdi), %r11
|
|
adc 0x20(%rsi), %r8
|
|
adc 0x28(%rsi), %r9
|
|
adc 0x30(%rsi), %r10
|
|
adc 0x38(%rsi), %r11
|
|
mov %r8, 0x20(%rdx)
|
|
mov %r9, 0x28(%rdx)
|
|
mov %r10, 0x30(%rdx)
|
|
mov %r11, 0x38(%rdx)
|
|
ret
|
|
.cfi_endproc
|
|
|
|
|
|
###########################################
|
|
# 2x503-bit multiprecision subtraction
|
|
# Operation: c [rdx] = a [rdi] - b [rsi].
|
|
# Returns borrow mask
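# (rax = 0 when there is no borrow, all ones otherwise)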
|
|
|
|
.globl ${PREFIX}_mpsubx2_asm
|
|
.type ${PREFIX}_mpsubx2_asm,\@function,3
|
|
${PREFIX}_mpsubx2_asm:
|
|
.cfi_startproc
|
|
xor %rax, %rax
|
|
|
|
mov 0x0(%rdi), %r8
|
|
mov 0x8(%rdi), %r9
|
|
mov 0x10(%rdi), %r10
|
|
mov 0x18(%rdi), %r11
|
|
mov 0x20(%rdi), %rcx
|
|
sub 0x0(%rsi), %r8
|
|
sbb 0x8(%rsi), %r9
|
|
sbb 0x10(%rsi), %r10
|
|
sbb 0x18(%rsi), %r11
|
|
sbb 0x20(%rsi), %rcx
|
|
mov %r8, 0x0(%rdx)
|
|
mov %r9, 0x8(%rdx)
|
|
mov %r10, 0x10(%rdx)
|
|
mov %r11, 0x18(%rdx)
|
|
mov %rcx, 0x20(%rdx)
|
|
|
|
mov 0x28(%rdi), %r8
|
|
mov 0x30(%rdi), %r9
|
|
mov 0x38(%rdi), %r10
|
|
mov 0x40(%rdi), %r11
|
|
mov 0x48(%rdi), %rcx
|
|
sbb 0x28(%rsi), %r8
|
|
sbb 0x30(%rsi), %r9
|
|
sbb 0x38(%rsi), %r10
|
|
sbb 0x40(%rsi), %r11
|
|
sbb 0x48(%rsi), %rcx
|
|
mov %r8, 0x28(%rdx)
|
|
mov %r9, 0x30(%rdx)
|
|
mov %r10, 0x38(%rdx)
|
|
mov %r11, 0x40(%rdx)
|
|
mov %rcx, 0x48(%rdx)
|
|
|
|
mov 0x50(%rdi), %r8
|
|
mov 0x58(%rdi), %r9
|
|
mov 0x60(%rdi), %r10
|
|
mov 0x68(%rdi), %r11
|
|
mov 0x70(%rdi), %rcx
|
|
sbb 0x50(%rsi), %r8
|
|
sbb 0x58(%rsi), %r9
|
|
sbb 0x60(%rsi), %r10
|
|
sbb 0x68(%rsi), %r11
|
|
sbb 0x70(%rsi), %rcx
|
|
mov %r8, 0x50(%rdx)
|
|
mov %r9, 0x58(%rdx)
|
|
mov %r10, 0x60(%rdx)
|
|
mov %r11, 0x68(%rdx)
|
|
mov %rcx, 0x70(%rdx)
|
|
|
|
mov 0x78(%rdi), %r8
|
|
sbb 0x78(%rsi), %r8
|
|
sbb \$0x0, %rax
|
|
mov %r8, 0x78(%rdx)
|
|
ret
|
|
.cfi_endproc
|
|
|
|
###########################################
|
|
# Double 2x503-bit multiprecision subtraction
|
|
# Operation: c [rdx] = c [rdx] - a [rdi] - b [rsi]
|
|
|
|
.globl ${PREFIX}_mpdblsubx2_asm
|
|
.type ${PREFIX}_mpdblsubx2_asm,\@function,3
|
|
${PREFIX}_mpdblsubx2_asm:
|
|
.cfi_startproc
|
|
push %r12
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r12, -16
|
|
push %r13
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r13, -24
|
|
push %r14
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r14, -32
|
|
|
|
xor %rax, %rax
|
|
|
|
mov 0x0(%rdx), %r8
|
|
mov 0x8(%rdx), %r9
|
|
mov 0x10(%rdx), %r10
|
|
mov 0x18(%rdx), %r11
|
|
mov 0x20(%rdx), %r12
|
|
mov 0x28(%rdx), %r13
|
|
mov 0x30(%rdx), %r14
|
|
mov 0x38(%rdx), %rcx
|
|
sub 0x0(%rdi), %r8
|
|
sbb 0x8(%rdi), %r9
|
|
sbb 0x10(%rdi), %r10
|
|
sbb 0x18(%rdi), %r11
|
|
sbb 0x20(%rdi), %r12
|
|
sbb 0x28(%rdi), %r13
|
|
sbb 0x30(%rdi), %r14
|
|
sbb 0x38(%rdi), %rcx
|
|
adc \$0x0, %rax
|
|
|
|
sub 0x0(%rsi), %r8
|
|
sbb 0x8(%rsi), %r9
|
|
sbb 0x10(%rsi), %r10
|
|
sbb 0x18(%rsi), %r11
|
|
sbb 0x20(%rsi), %r12
|
|
sbb 0x28(%rsi), %r13
|
|
sbb 0x30(%rsi), %r14
|
|
sbb 0x38(%rsi), %rcx
|
|
adc \$0x0, %rax
|
|
|
|
mov %r8, 0x0(%rdx)
|
|
mov %r9, 0x8(%rdx)
|
|
mov %r10, 0x10(%rdx)
|
|
mov %r11, 0x18(%rdx)
|
|
mov %r12, 0x20(%rdx)
|
|
mov %r13, 0x28(%rdx)
|
|
mov %r14, 0x30(%rdx)
|
|
mov %rcx, 0x38(%rdx)
|
|
|
|
mov 0x40(%rdx), %r8
|
|
mov 0x48(%rdx), %r9
|
|
mov 0x50(%rdx), %r10
|
|
mov 0x58(%rdx), %r11
|
|
mov 0x60(%rdx), %r12
|
|
mov 0x68(%rdx), %r13
|
|
mov 0x70(%rdx), %r14
|
|
mov 0x78(%rdx), %rcx
|
|
|
|
sub %rax, %r8
|
|
sbb 0x40(%rdi), %r8
|
|
sbb 0x48(%rdi), %r9
|
|
sbb 0x50(%rdi), %r10
|
|
sbb 0x58(%rdi), %r11
|
|
sbb 0x60(%rdi), %r12
|
|
sbb 0x68(%rdi), %r13
|
|
sbb 0x70(%rdi), %r14
|
|
sbb 0x78(%rdi), %rcx
|
|
sub 0x40(%rsi), %r8
|
|
sbb 0x48(%rsi), %r9
|
|
sbb 0x50(%rsi), %r10
|
|
sbb 0x58(%rsi), %r11
|
|
sbb 0x60(%rsi), %r12
|
|
sbb 0x68(%rsi), %r13
|
|
sbb 0x70(%rsi), %r14
|
|
sbb 0x78(%rsi), %rcx
|
|
|
|
mov %r8, 0x40(%rdx)
|
|
mov %r9, 0x48(%rdx)
|
|
mov %r10, 0x50(%rdx)
|
|
mov %r11, 0x58(%rdx)
|
|
mov %r12, 0x60(%rdx)
|
|
mov %r13, 0x68(%rdx)
|
|
mov %r14, 0x70(%rdx)
|
|
mov %rcx, 0x78(%rdx)
|
|
|
|
pop %r14
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r13
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r12
|
|
.cfi_adjust_cfa_offset -8
|
|
ret
|
|
.cfi_endproc
|
|
|
|
###########################################
|
|
# Montgomery multiplication
|
|
# 503-bit multiplication using Karatsuba
|
|
# (one level), schoolbook (one level)
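# With A = AH*2^256 + AL and B = BH*2^256 + BL the product is assembled as
#   A*B = AH*BH*2^512 + ((AH+AL)*(BH+BL) - AH*BH - AL*BL)*2^256 + AL*BL,
# with each 256x256-bit partial product computed by MUL256_SCHOOL.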
|
|
.mul_mulx_asm:
|
|
.cfi_startproc
|
|
# sike_mpmul has already pushed r12-r15 by this point.
|
|
.cfi_adjust_cfa_offset 32
|
|
.cfi_offset r12, -16
|
|
.cfi_offset r13, -24
|
|
.cfi_offset r14, -32
|
|
.cfi_offset r15, -40
|
|
|
|
mov %rdx, %rcx
|
|
|
|
# r8-r11 <- AH + AL, rax <- mask
|
|
xor %rax, %rax
|
|
mov (%rdi), %r8
|
|
mov 0x8(%rdi), %r9
|
|
mov 0x10(%rdi), %r10
|
|
mov 0x18(%rdi), %r11
|
|
push %rbx
|
|
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset rbx, -48
|
|
push %rbp
|
|
.cfi_offset rbp, -56
|
|
.cfi_adjust_cfa_offset 8
|
|
sub \$96, %rsp
|
|
.cfi_adjust_cfa_offset 96
|
|
add 0x20(%rdi), %r8
|
|
adc 0x28(%rdi), %r9
|
|
adc 0x30(%rdi), %r10
|
|
adc 0x38(%rdi), %r11
|
|
sbb \$0x0, %rax
|
|
mov %r8, (%rsp)
|
|
mov %r9, 0x8(%rsp)
|
|
mov %r10, 0x10(%rsp)
|
|
mov %r11, 0x18(%rsp)
|
|
|
|
# r12-r15 <- BH + BL, rbx <- mask
|
|
xor %rbx, %rbx
|
|
mov (%rsi), %r12
|
|
mov 0x8(%rsi), %r13
|
|
mov 0x10(%rsi), %r14
|
|
mov 0x18(%rsi), %r15
|
|
add 0x20(%rsi), %r12
|
|
adc 0x28(%rsi), %r13
|
|
adc 0x30(%rsi), %r14
|
|
adc 0x38(%rsi), %r15
|
|
sbb \$0x0, %rbx
|
|
mov %r12, 0x20(%rsp)
|
|
mov %r13, 0x28(%rsp)
|
|
mov %r14, 0x30(%rsp)
|
|
mov %r15, 0x38(%rsp)
|
|
|
|
# r12-r15 <- masked (BH + BL)
|
|
and %rax, %r12
|
|
and %rax, %r13
|
|
and %rax, %r14
|
|
and %rax, %r15
|
|
|
|
# r8-r11 <- masked (AH + AL)
|
|
and %rbx, %r8
|
|
and %rbx, %r9
|
|
and %rbx, %r10
|
|
and %rbx, %r11
|
|
|
|
# r8-r11 <- masked (AH + AL) + masked (BH + BL)
|
|
add %r12, %r8
|
|
adc %r13, %r9
|
|
adc %r14, %r10
|
|
adc %r15, %r11
|
|
mov %r8, 0x40(%rsp)
|
|
mov %r9, 0x48(%rsp)
|
|
mov %r10, 0x50(%rsp)
|
|
mov %r11, 0x58(%rsp)
|
|
___
|
|
|
|
# [rcx+64] <- (AH+AL) x (BH+BL), low part
|
|
&MUL256_SCHOOL( 0, "rsp",32,"rsp",64,"rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "rbx", "rbp") if ($addx);
|
|
# [rcx] <- AL x BL (Result c0-c3)
|
|
&MUL256_SCHOOL( 0, "rdi", 0,"rsi", 0,"rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "rbx", "rbp") if ($addx);
|
|
# [rsp] <- AH x BH
|
|
&MUL256_SCHOOL(32, "rdi",32,"rsi", 0,"rsp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "rbx", "rbp") if ($addx);
|
|
|
|
$code.=<<___ if ($addx);
|
|
# r8-r11 <- (AH+AL) x (BH+BL), final step
|
|
mov 0x40(%rsp), %r8
|
|
mov 0x48(%rsp), %r9
|
|
mov 0x50(%rsp), %r10
|
|
mov 0x58(%rsp), %r11
|
|
mov 0x60(%rcx), %rax
|
|
add %rax, %r8
|
|
mov 0x68(%rcx), %rax
|
|
adc %rax, %r9
|
|
mov 0x70(%rcx), %rax
|
|
adc %rax, %r10
|
|
mov 0x78(%rcx), %rax
|
|
adc %rax, %r11
|
|
|
|
# r8-r15 <- (AH+AL) x (BH+BL) - ALxBL
|
|
mov 0x40(%rcx), %r12
|
|
mov 0x48(%rcx), %r13
|
|
mov 0x50(%rcx), %r14
|
|
mov 0x58(%rcx), %r15
|
|
sub (%rcx), %r12
|
|
sbb 0x8(%rcx), %r13
|
|
sbb 0x10(%rcx), %r14
|
|
sbb 0x18(%rcx), %r15
|
|
sbb 0x20(%rcx), %r8
|
|
sbb 0x28(%rcx), %r9
|
|
sbb 0x30(%rcx), %r10
|
|
sbb 0x38(%rcx), %r11
|
|
|
|
# r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
|
|
sub (%rsp), %r12
|
|
sbb 0x8(%rsp), %r13
|
|
sbb 0x10(%rsp), %r14
|
|
sbb 0x18(%rsp), %r15
|
|
sbb 0x20(%rsp), %r8
|
|
sbb 0x28(%rsp), %r9
|
|
sbb 0x30(%rsp), %r10
|
|
sbb 0x38(%rsp), %r11
|
|
|
|
add 0x20(%rcx), %r12
|
|
mov %r12, 0x20(%rcx) # Result C4-C7
|
|
adc 0x28(%rcx), %r13
|
|
mov %r13, 0x28(%rcx)
|
|
adc 0x30(%rcx), %r14
|
|
mov %r14, 0x30(%rcx)
|
|
adc 0x38(%rcx), %r15
|
|
mov %r15, 0x38(%rcx)
|
|
mov (%rsp), %rax
|
|
adc %rax, %r8 # Result C8-C15
|
|
mov %r8, 0x40(%rcx)
|
|
mov 0x8(%rsp), %rax
|
|
adc %rax, %r9
|
|
mov %r9, 0x48(%rcx)
|
|
mov 0x10(%rsp), %rax
|
|
adc %rax, %r10
|
|
mov %r10, 0x50(%rcx)
|
|
mov 0x18(%rsp), %rax
|
|
adc %rax, %r11
|
|
mov %r11, 0x58(%rcx)
|
|
mov 0x20(%rsp), %r12
|
|
adc \$0x0, %r12
|
|
mov %r12, 0x60(%rcx)
|
|
mov 0x28(%rsp), %r13
|
|
adc \$0x0, %r13
|
|
mov %r13, 0x68(%rcx)
|
|
mov 0x30(%rsp), %r14
|
|
adc \$0x0, %r14
|
|
mov %r14, 0x70(%rcx)
|
|
mov 0x38(%rsp), %r15
|
|
adc \$0x0, %r15
|
|
mov %r15, 0x78(%rcx)
|
|
|
|
add \$96, %rsp
|
|
.cfi_adjust_cfa_offset -96
|
|
pop %rbp
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_same_value rbp
|
|
pop %rbx
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_same_value rbx
|
|
pop %r15
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_same_value r15
|
|
pop %r14
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_same_value r14
|
|
pop %r13
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_same_value r13
|
|
pop %r12
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_same_value r12
|
|
ret
|
|
|
|
|
|
###########################################
|
|
# Integer multiplication
|
|
# Based on Karatsuba method
|
|
# Operation: c [rdx] = a [rdi] * b [rsi]
|
|
# NOTE: a=c or b=c are not allowed
|
|
|
|
.globl ${PREFIX}_mpmul
|
|
.type ${PREFIX}_mpmul,\@function,3
|
|
${PREFIX}_mpmul:
|
|
push %r12
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r12, -16
|
|
push %r13
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r13, -24
|
|
push %r14
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r14, -32
|
|
push %r15
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r15, -40
|
|
|
|
___
|
|
|
|
$code.=<<___ if ($addx);
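# Use the MULX/ADCX/ADOX path only when both BMI2 (bit 8) and ADX (bit 19)
# are set in the third word of OPENSSL_ia32cap_P (EBX of CPUID leaf 7).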
|
|
lea OPENSSL_ia32cap_P(%rip), %rcx
|
|
mov 8(%rcx), %rcx
|
|
and \$0x80100, %ecx
|
|
cmp \$0x80100, %ecx
|
|
je .mul_mulx_asm
|
|
|
|
___
|
|
|
|
$code.=<<___;
|
|
mov %rdx, %rcx
|
|
|
|
# rcx[0-3] <- AH+AL
|
|
xor %rax, %rax
|
|
mov 0x20(%rdi), %r8
|
|
mov 0x28(%rdi), %r9
|
|
mov 0x30(%rdi), %r10
|
|
mov 0x38(%rdi), %r11
|
|
add 0x0(%rdi), %r8
|
|
adc 0x8(%rdi), %r9
|
|
adc 0x10(%rdi), %r10
|
|
adc 0x18(%rdi), %r11
|
|
mov %r8, 0x0(%rcx)
|
|
mov %r9, 0x8(%rcx)
|
|
mov %r10, 0x10(%rcx)
|
|
mov %r11, 0x18(%rcx)
|
|
sbb \$0, %rax
|
|
sub \$80, %rsp # Allocate space on the stack
|
|
.cfi_adjust_cfa_offset 80
|
|
|
|
# r12-r15 <- BH+BL
|
|
xor %rdx, %rdx
|
|
mov 0x20(%rsi), %r12
|
|
mov 0x28(%rsi), %r13
|
|
mov 0x30(%rsi), %r14
|
|
mov 0x38(%rsi), %r15
|
|
add 0x0(%rsi), %r12
|
|
adc 0x8(%rsi), %r13
|
|
adc 0x10(%rsi), %r14
|
|
adc 0x18(%rsi), %r15
|
|
sbb \$0x0, %rdx
|
|
mov %rax, 0x40(%rsp)
|
|
mov %rdx, 0x48(%rsp)
|
|
|
|
# (rsp[0-3],r8,r9,r10,r11) <- (AH+AL)*(BH+BL)
|
|
mov (%rcx), %rax
|
|
mul %r12
|
|
mov %rax, (%rsp) # c0
|
|
mov %rdx, %r8
|
|
|
|
xor %r9, %r9
|
|
mov (%rcx), %rax
|
|
mul %r13
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
|
|
xor %r10, %r10
|
|
mov 0x8(%rcx), %rax
|
|
mul %r12
|
|
add %rax, %r8
|
|
mov %r8, 0x8(%rsp) # c1
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
xor %r8, %r8
|
|
mov (%rcx), %rax
|
|
mul %r14
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
mov 0x10(%rcx), %rax
|
|
mul %r12
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
mov 0x8(%rcx), %rax
|
|
mul %r13
|
|
add %rax, %r9
|
|
mov %r9, 0x10(%rsp) # c2
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
xor %r9, %r9
|
|
mov (%rcx), %rax
|
|
mul %r15
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
mov 0x18(%rcx), %rax
|
|
mul %r12
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
mov 0x8(%rcx), %rax
|
|
mul %r14
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
mov 0x10(%rcx), %rax
|
|
mul %r13
|
|
add %rax, %r10
|
|
mov %r10, 0x18(%rsp) # c3
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
xor %r10, %r10
|
|
mov 0x8(%rcx), %rax
|
|
mul %r15
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
mov 0x18(%rcx), %rax
|
|
mul %r13
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
mov 0x10(%rcx), %rax
|
|
mul %r14
|
|
add %rax, %r8
|
|
mov %r8, 0x20(%rsp) # c4
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
xor %r11, %r11
|
|
mov 0x10(%rcx), %rax
|
|
mul %r15
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r11
|
|
|
|
mov 0x18(%rcx), %rax
|
|
mul %r14
|
|
add %rax, %r9 # c5
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r11
|
|
|
|
mov 0x18(%rcx), %rax
|
|
mul %r15
|
|
add %rax, %r10 # c6
|
|
adc %rdx, %r11 # c7
|
|
|
|
mov 0x40(%rsp), %rax
|
|
and %rax, %r12
|
|
and %rax, %r13
|
|
and %rax, %r14
|
|
and %rax, %r15
|
|
add %r8, %r12
|
|
adc %r9, %r13
|
|
adc %r10, %r14
|
|
adc %r11, %r15
|
|
|
|
mov 0x48(%rsp), %rax
|
|
mov (%rcx), %r8
|
|
mov 0x8(%rcx), %r9
|
|
mov 0x10(%rcx), %r10
|
|
mov 0x18(%rcx), %r11
|
|
and %rax, %r8
|
|
and %rax, %r9
|
|
and %rax, %r10
|
|
and %rax, %r11
|
|
add %r12, %r8
|
|
adc %r13, %r9
|
|
adc %r14, %r10
|
|
adc %r15, %r11
|
|
mov %r8, 0x20(%rsp)
|
|
mov %r9, 0x28(%rsp)
|
|
mov %r10, 0x30(%rsp)
|
|
mov %r11, 0x38(%rsp)
|
|
|
|
mov (%rdi), %r11
|
|
mov (%rsi), %rax
|
|
mul %r11
|
|
xor %r9, %r9
|
|
mov %rax, (%rcx) # c0
|
|
mov %rdx, %r8
|
|
|
|
mov 0x10(%rdi), %r14
|
|
mov 0x8(%rsi), %rax
|
|
mul %r11
|
|
xor %r10, %r10
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
|
|
mov 0x8(%rdi), %r12
|
|
mov (%rsi), %rax
|
|
mul %r12
|
|
add %rax, %r8
|
|
mov %r8, 0x8(%rcx) # c1
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
xor %r8, %r8
|
|
mov 0x10(%rsi), %rax
|
|
mul %r11
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
mov (%rsi), %r13
|
|
mov %r14, %rax
|
|
mul %r13
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
mov 0x8(%rsi), %rax
|
|
mul %r12
|
|
add %rax, %r9
|
|
mov %r9, 0x10(%rcx) # c2
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
xor %r9, %r9
|
|
mov 0x18(%rsi), %rax
|
|
mul %r11
|
|
mov 0x18(%rdi), %r15
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
mov %r15, %rax
|
|
mul %r13
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
mov 0x10(%rsi), %rax
|
|
mul %r12
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
mov 0x8(%rsi), %rax
|
|
mul %r14
|
|
add %rax, %r10
|
|
mov %r10, 0x18(%rcx) # c3
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
xor %r10, %r10
|
|
mov 0x18(%rsi), %rax
|
|
mul %r12
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
mov 0x8(%rsi), %rax
|
|
mul %r15
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
mov 0x10(%rsi), %rax
|
|
mul %r14
|
|
add %rax, %r8
|
|
mov %r8, 0x20(%rcx) # c4
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
xor %r8, %r8
|
|
mov 0x18(%rsi), %rax
|
|
mul %r14
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
mov 0x10(%rsi), %rax
|
|
mul %r15
|
|
add %rax, %r9
|
|
mov %r9, 0x28(%rcx) # c5
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
mov 0x18(%rsi), %rax
|
|
mul %r15
|
|
add %rax, %r10
|
|
mov %r10, 0x30(%rcx) # c6
|
|
adc %rdx, %r8
|
|
mov %r8, 0x38(%rcx) # c7
|
|
|
|
# rcx[8-15] <- AH*BH
|
|
mov 0x20(%rdi), %r11
|
|
mov 0x20(%rsi), %rax
|
|
mul %r11
|
|
xor %r9, %r9
|
|
mov %rax, 0x40(%rcx) # c0
|
|
mov %rdx, %r8
|
|
|
|
mov 0x30(%rdi), %r14
|
|
mov 0x28(%rsi), %rax
|
|
mul %r11
|
|
xor %r10, %r10
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
|
|
mov 0x28(%rdi), %r12
|
|
mov 0x20(%rsi), %rax
|
|
mul %r12
|
|
add %rax, %r8
|
|
mov %r8, 0x48(%rcx) # c1
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
xor %r8, %r8
|
|
mov 0x30(%rsi), %rax
|
|
mul %r11
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
mov 0x20(%rsi), %r13
|
|
mov %r14, %rax
|
|
mul %r13
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
mov 0x28(%rsi), %rax
|
|
mul %r12
|
|
add %rax, %r9
|
|
mov %r9, 0x50(%rcx) # c2
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
xor %r9, %r9
|
|
mov 0x38(%rsi), %rax
|
|
mul %r11
|
|
mov 0x38(%rdi), %r15
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
mov %r15, %rax
|
|
mul %r13
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
mov 0x30(%rsi), %rax
|
|
mul %r12
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
mov 0x28(%rsi), %rax
|
|
mul %r14
|
|
add %rax, %r10
|
|
mov %r10, 0x58(%rcx) # c3
|
|
adc %rdx, %r8
|
|
adc \$0x0, %r9
|
|
|
|
xor %r10, %r10
|
|
mov 0x38(%rsi), %rax
|
|
mul %r12
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
mov 0x28(%rsi), %rax
|
|
mul %r15
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
mov 0x30(%rsi), %rax
|
|
mul %r14
|
|
add %rax, %r8
|
|
mov %r8, 0x60(%rcx) # c4
|
|
adc %rdx, %r9
|
|
adc \$0x0, %r10
|
|
|
|
xor %r8, %r8
|
|
mov 0x38(%rsi), %rax
|
|
mul %r14
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
mov 0x30(%rsi), %rax
|
|
mul %r15
|
|
add %rax, %r9
|
|
mov %r9, 0x68(%rcx) # c5
|
|
adc %rdx, %r10
|
|
adc \$0x0, %r8
|
|
|
|
mov 0x38(%rsi), %rax
|
|
mul %r15
|
|
add %rax, %r10
|
|
mov %r10, 0x70(%rcx) # c6
|
|
adc %rdx, %r8
|
|
mov %r8, 0x78(%rcx) # c7
|
|
|
|
# [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL
|
|
mov 0x0(%rsp), %r8
|
|
sub 0x0(%rcx), %r8
|
|
mov 0x8(%rsp), %r9
|
|
sbb 0x8(%rcx), %r9
|
|
mov 0x10(%rsp), %r10
|
|
sbb 0x10(%rcx), %r10
|
|
mov 0x18(%rsp), %r11
|
|
sbb 0x18(%rcx), %r11
|
|
mov 0x20(%rsp), %r12
|
|
sbb 0x20(%rcx), %r12
|
|
mov 0x28(%rsp), %r13
|
|
sbb 0x28(%rcx), %r13
|
|
mov 0x30(%rsp), %r14
|
|
sbb 0x30(%rcx), %r14
|
|
mov 0x38(%rsp), %r15
|
|
sbb 0x38(%rcx), %r15
|
|
|
|
# [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH
|
|
mov 0x40(%rcx), %rax
|
|
sub %rax, %r8
|
|
mov 0x48(%rcx), %rax
|
|
sbb %rax, %r9
|
|
mov 0x50(%rcx), %rax
|
|
sbb %rax, %r10
|
|
mov 0x58(%rcx), %rax
|
|
sbb %rax, %r11
|
|
mov 0x60(%rcx), %rax
|
|
sbb %rax, %r12
|
|
mov 0x68(%rcx), %rdx
|
|
sbb %rdx, %r13
|
|
mov 0x70(%rcx), %rdi
|
|
sbb %rdi, %r14
|
|
mov 0x78(%rcx), %rsi
|
|
sbb %rsi, %r15
|
|
|
|
# Final result
|
|
add 0x20(%rcx), %r8
|
|
mov %r8, 0x20(%rcx)
|
|
adc 0x28(%rcx), %r9
|
|
mov %r9, 0x28(%rcx)
|
|
adc 0x30(%rcx), %r10
|
|
mov %r10, 0x30(%rcx)
|
|
adc 0x38(%rcx), %r11
|
|
mov %r11, 0x38(%rcx)
|
|
adc 0x40(%rcx), %r12
|
|
mov %r12, 0x40(%rcx)
|
|
adc 0x48(%rcx), %r13
|
|
mov %r13, 0x48(%rcx)
|
|
adc 0x50(%rcx), %r14
|
|
mov %r14, 0x50(%rcx)
|
|
adc 0x58(%rcx), %r15
|
|
mov %r15, 0x58(%rcx)
|
|
adc \$0x0, %rax
|
|
mov %rax, 0x60(%rcx)
|
|
adc \$0x0, %rdx
|
|
mov %rdx, 0x68(%rcx)
|
|
adc \$0x0, %rdi
|
|
mov %rdi, 0x70(%rcx)
|
|
adc \$0x0, %rsi
|
|
mov %rsi, 0x78(%rcx)
|
|
|
|
add \$80, %rsp # Restore the stack pointer
|
|
.cfi_adjust_cfa_offset -80
|
|
pop %r15
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r14
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r13
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r12
|
|
.cfi_adjust_cfa_offset -8
|
|
ret
|
|
.cfi_endproc
|
|
___
|
|
|
|
# Optimized x86 code for CPUs with ADOX/ADCX and BMI2
|
|
$code.=<<___ if ($addx);
|
|
###########################################
|
|
# Montgomery reduction
|
|
# Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
|
|
# Operation: c [rsi] = a [rdi]
|
|
# NOTE: a=c is not allowed
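# The three least-significant limbs of p503+1 are zero, so only the five
# limbs stored at p503p1_nz enter the multiplications; each call to
# MUL128x320_SCHOOL below folds two more input limbs into the result.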
|
|
|
|
.rdc503_mulx_asm:
|
|
.cfi_startproc
|
|
# sike_fprdc has already pushed r12-r15 and rbx by this point.
|
|
.cfi_adjust_cfa_offset 32
|
|
.cfi_offset r12, -16
|
|
.cfi_offset r13, -24
|
|
.cfi_offset r14, -32
|
|
.cfi_offset r15, -40
|
|
.cfi_offset rbx, -48
|
|
.cfi_adjust_cfa_offset 8
|
|
___
|
|
|
|
# a[0-1] x p503p1_nz --> result: r8:r14
|
|
&MUL128x320_SCHOOL(0, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx);
|
|
$code.=<<___ if ($addx);
|
|
xor %r15, %r15
|
|
add 0x18(%rdi), %r8
|
|
adc 0x20(%rdi), %r9
|
|
adc 0x28(%rdi), %r10
|
|
adc 0x30(%rdi), %r11
|
|
adc 0x38(%rdi), %r12
|
|
adc 0x40(%rdi), %r13
|
|
adc 0x48(%rdi), %r14
|
|
adc 0x50(%rdi), %r15
|
|
mov %r8, 0x18(%rdi)
|
|
mov %r9, 0x20(%rdi)
|
|
mov %r10, 0x28(%rdi)
|
|
mov %r11, 0x30(%rdi)
|
|
mov %r12, 0x38(%rdi)
|
|
mov %r13, 0x40(%rdi)
|
|
mov %r14, 0x48(%rdi)
|
|
mov %r15, 0x50(%rdi)
|
|
mov 0x58(%rdi), %r8
|
|
mov 0x60(%rdi), %r9
|
|
mov 0x68(%rdi), %r10
|
|
mov 0x70(%rdi), %r11
|
|
mov 0x78(%rdi), %r12
|
|
adc \$0x0, %r8
|
|
adc \$0x0, %r9
|
|
adc \$0x0, %r10
|
|
adc \$0x0, %r11
|
|
adc \$0x0, %r12
|
|
mov %r8, 0x58(%rdi)
|
|
mov %r9, 0x60(%rdi)
|
|
mov %r10, 0x68(%rdi)
|
|
mov %r11, 0x70(%rdi)
|
|
mov %r12, 0x78(%rdi)
|
|
___
|
|
|
|
# a[2-3] x p503p1_nz --> result: r8:r14
|
|
&MUL128x320_SCHOOL(16, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx);
|
|
|
|
$code.=<<___ if ($addx);
|
|
xor %r15, %r15
|
|
add 0x28(%rdi), %r8
|
|
adc 0x30(%rdi), %r9
|
|
adc 0x38(%rdi), %r10
|
|
adc 0x40(%rdi), %r11
|
|
adc 0x48(%rdi), %r12
|
|
adc 0x50(%rdi), %r13
|
|
adc 0x58(%rdi), %r14
|
|
adc 0x60(%rdi), %r15
|
|
mov %r8, 0x28(%rdi)
|
|
mov %r9, 0x30(%rdi)
|
|
mov %r10, 0x38(%rdi)
|
|
mov %r11, 0x40(%rdi)
|
|
mov %r12, 0x48(%rdi)
|
|
mov %r13, 0x50(%rdi)
|
|
mov %r14, 0x58(%rdi)
|
|
mov %r15, 0x60(%rdi)
|
|
mov 0x68(%rdi), %r8
|
|
mov 0x70(%rdi), %r9
|
|
mov 0x78(%rdi), %r10
|
|
adc \$0x0, %r8
|
|
adc \$0x0, %r9
|
|
adc \$0x0, %r10
|
|
mov %r8, 0x68(%rdi)
|
|
mov %r9, 0x70(%rdi)
|
|
mov %r10, 0x78(%rdi)
|
|
___
|
|
|
|
# a[4-5] x p503p1_nz --> result: r8:r14
|
|
&MUL128x320_SCHOOL(32, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx);
|
|
|
|
$code.=<<___ if ($addx);
|
|
xor %r15, %r15
|
|
xor %rbx, %rbx
|
|
add 0x38(%rdi), %r8
|
|
adc 0x40(%rdi), %r9
|
|
adc 0x48(%rdi), %r10
|
|
adc 0x50(%rdi), %r11
|
|
adc 0x58(%rdi), %r12
|
|
adc 0x60(%rdi), %r13
|
|
adc 0x68(%rdi), %r14
|
|
adc 0x70(%rdi), %r15
|
|
adc 0x78(%rdi), %rbx
|
|
mov %r8, 0x38(%rdi)
|
|
mov %r9, (%rsi) # Final result c0
|
|
mov %r10, 0x48(%rdi)
|
|
mov %r11, 0x50(%rdi)
|
|
mov %r12, 0x58(%rdi)
|
|
mov %r13, 0x60(%rdi)
|
|
mov %r14, 0x68(%rdi)
|
|
mov %r15, 0x70(%rdi)
|
|
mov %rbx, 0x78(%rdi)
|
|
___
|
|
|
|
# a[6-7] x p503p1_nz --> result: r8:r14
|
|
&MUL128x320_SCHOOL(48, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx);
|
|
|
|
# Final result c1:c7
|
|
$code.=<<___ if ($addx);
|
|
add 0x48(%rdi), %r8
|
|
adc 0x50(%rdi), %r9
|
|
adc 0x58(%rdi), %r10
|
|
adc 0x60(%rdi), %r11
|
|
adc 0x68(%rdi), %r12
|
|
adc 0x70(%rdi), %r13
|
|
adc 0x78(%rdi), %r14
|
|
mov %r8, 0x8(%rsi)
|
|
mov %r9, 0x10(%rsi)
|
|
mov %r10, 0x18(%rsi)
|
|
mov %r11, 0x20(%rsi)
|
|
mov %r12, 0x28(%rsi)
|
|
mov %r13, 0x30(%rsi)
|
|
mov %r14, 0x38(%rsi)
|
|
___
|
|
|
|
$code.=<<___ if ($addx);
|
|
pop %rbx
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_same_value rbx
|
|
pop %r15
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_same_value r15
|
|
pop %r14
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_same_value r14
|
|
pop %r13
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_same_value r13
|
|
pop %r12
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_same_value r12
|
|
ret
|
|
.cfi_endproc
|
|
___
|
|
|
|
$code.=<<___;
|
|
###########################################
|
|
# Montgomery reduction
|
|
# Based on the Comba method
|
|
# Operation: c [rsi] = a [rdi]
|
|
# NOTE: a=c is not allowed
|
|
|
|
.globl ${PREFIX}_fprdc
|
|
.type ${PREFIX}_fprdc,\@function,3
|
|
${PREFIX}_fprdc:
|
|
.cfi_startproc
|
|
push %r12
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r12, -16
|
|
push %r13
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r13, -24
|
|
push %r14
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r14, -32
|
|
push %r15
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset r15, -40
|
|
push %rbx
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset rbx, -48
|
|
___
|
|
|
|
$code.=<<___ if ($addx);
|
|
lea OPENSSL_ia32cap_P(%rip), %rcx
|
|
mov 8(%rcx), %rcx
|
|
and \$0x80100, %ecx
|
|
cmp \$0x80100, %ecx
|
|
je .rdc503_mulx_asm
|
|
___
|
|
|
|
# Reduction, generic x86 implementation
|
|
$code.=<<___;
|
|
lea p503p1(%rip), %rbx
|
|
|
|
mov (%rdi), %r11
|
|
mov (%rbx), %rax
|
|
mul %r11
|
|
xor %r8, %r8
|
|
add 0x18(%rdi), %rax
|
|
mov %rax, 0x18(%rsi) # z3
|
|
adc %rdx, %r8
|
|
|
|
xor %r9, %r9
|
|
mov 8(%rbx), %rax
|
|
mul %r11
|
|
xor %r10, %r10
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
|
|
mov 0x8(%rdi), %r12
|
|
mov (%rbx), %rax
|
|
mul %r12
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
add 0x20(%rdi), %r8
|
|
mov %r8, 0x20(%rsi) # z4
|
|
adc \$0, %r9
|
|
adc \$0, %r10
|
|
|
|
xor %r8, %r8
|
|
mov 0x10(%rbx), %rax
|
|
mul %r11
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
|
|
mov 8(%rbx), %rax
|
|
mul %r12
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
|
|
mov 0x10(%rdi), %r13
|
|
mov (%rbx), %rax
|
|
mul %r13
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
add 0x28(%rdi), %r9
|
|
mov %r9, 0x28(%rsi) # z5
|
|
adc \$0, %r10
|
|
adc \$0, %r8
|
|
|
|
xor %r9, %r9
|
|
mov 0x18(%rbx), %rax
|
|
mul %r11
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
|
|
mov 0x10(%rbx), %rax
|
|
mul %r12
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
|
|
mov 0x8(%rbx), %rax
|
|
mul %r13
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
|
|
mov 0x18(%rsi), %r14
|
|
mov (%rbx), %rax
|
|
mul %r14
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
add 0x30(%rdi), %r10
|
|
mov %r10, 0x30(%rsi) # z6
|
|
adc \$0, %r8
|
|
adc \$0, %r9
|
|
|
|
xor %r10, %r10
|
|
mov 0x20(%rbx), %rax
|
|
mul %r11
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
|
|
mov 0x18(%rbx), %rax
|
|
mul %r12
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
|
|
mov 0x10(%rbx), %rax
|
|
mul %r13
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
|
|
mov 0x8(%rbx), %rax
|
|
mul %r14
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
|
|
mov 0x20(%rsi), %r15
|
|
mov (%rbx), %rax
|
|
mul %r15
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
add 0x38(%rdi), %r8 # Z7
|
|
mov %r8, 0x38(%rsi)
|
|
adc \$0, %r9
|
|
adc \$0, %r10
|
|
|
|
xor %r8, %r8
|
|
mov 0x20(%rbx), %rax
|
|
mul %r12
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
|
|
mov 0x18(%rbx), %rax
|
|
mul %r13
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
|
|
mov 0x10(%rbx), %rax
|
|
mul %r14
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
|
|
mov 0x8(%rbx), %rax
|
|
mul %r15
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
|
|
mov 0x28(%rsi), %rcx
|
|
mov (%rbx), %rax
|
|
mul %rcx
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
add 0x40(%rdi), %r9
|
|
mov %r9, (%rsi) # Z0
|
|
adc \$0, %r10
|
|
adc \$0, %r8
|
|
|
|
xor %r9, %r9
|
|
mov 0x20(%rbx), %rax
|
|
mul %r13
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
|
|
mov 0x18(%rbx), %rax
|
|
mul %r14
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
|
|
mov 0x10(%rbx), %rax
|
|
mul %r15
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
|
|
mov 8(%rbx), %rax
|
|
mul %rcx
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
|
|
mov 0x30(%rsi), %r13
|
|
mov (%rbx), %rax
|
|
mul %r13
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
add 0x48(%rdi), %r10
|
|
mov %r10, 0x8(%rsi) # Z1
|
|
adc \$0, %r8
|
|
adc \$0, %r9
|
|
|
|
xor %r10, %r10
|
|
mov 0x20(%rbx), %rax
|
|
mul %r14
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
|
|
mov 0x18(%rbx), %rax
|
|
mul %r15
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
|
|
mov 0x10(%rbx), %rax
|
|
mul %rcx
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
|
|
mov 8(%rbx), %rax
|
|
mul %r13
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
|
|
mov 0x38(%rsi), %r14
|
|
mov (%rbx), %rax
|
|
mul %r14
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
add 0x50(%rdi), %r8
|
|
mov %r8, 0x10(%rsi) # Z2
|
|
adc \$0, %r9
|
|
adc \$0, %r10
|
|
|
|
xor %r8, %r8
|
|
mov 0x20(%rbx), %rax
|
|
mul %r15
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
|
|
mov 0x18(%rbx), %rax
|
|
mul %rcx
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
|
|
mov 0x10(%rbx), %rax
|
|
mul %r13
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
|
|
mov 8(%rbx), %rax
|
|
mul %r14
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
adc \$0, %r8
|
|
add 0x58(%rdi), %r9
|
|
mov %r9, 0x18(%rsi) # Z3
|
|
adc \$0, %r10
|
|
adc \$0, %r8
|
|
|
|
xor %r9, %r9
|
|
mov 0x20(%rbx), %rax
|
|
mul %rcx
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
|
|
mov 0x18(%rbx), %rax
|
|
mul %r13
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
|
|
mov 0x10(%rbx), %rax
|
|
mul %r14
|
|
add %rax, %r10
|
|
adc %rdx, %r8
|
|
adc \$0, %r9
|
|
add 0x60(%rdi), %r10
|
|
mov %r10, 0x20(%rsi) # Z4
|
|
adc \$0, %r8
|
|
adc \$0, %r9
|
|
|
|
xor %r10, %r10
|
|
mov 0x20(%rbx), %rax
|
|
mul %r13
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
|
|
mov 0x18(%rbx), %rax
|
|
mul %r14
|
|
add %rax, %r8
|
|
adc %rdx, %r9
|
|
adc \$0, %r10
|
|
add 0x68(%rdi), %r8 # Z5
|
|
mov %r8, 0x28(%rsi) # Z5
|
|
adc \$0, %r9
|
|
adc \$0, %r10
|
|
|
|
mov 0x20(%rbx), %rax
|
|
mul %r14
|
|
add %rax, %r9
|
|
adc %rdx, %r10
|
|
add 0x70(%rdi), %r9 # Z6
|
|
mov %r9, 0x30(%rsi) # Z6
|
|
adc \$0, %r10
|
|
add 0x78(%rdi), %r10 # Z7
|
|
mov %r10, 0x38(%rsi) # Z7
|
|
|
|
pop %rbx
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r15
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r14
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r13
|
|
.cfi_adjust_cfa_offset -8
|
|
pop %r12
|
|
.cfi_adjust_cfa_offset -8
|
|
ret
|
|
.cfi_endproc
|
|
___
|
|
|
|
foreach (split("\n",$code)) {
|
|
s/\`([^\`]*)\`/eval($1)/ge;
|
|
print $_,"\n";
|
|
}
|
|
|
|
close STDOUT;
|