453 行
7.9 KiB
ArmAsm
453 行
7.9 KiB
ArmAsm
|
|
.intel_syntax noprefix

/*
 * Read-only constants for arithmetic modulo p.
 *
 * Every multi-limb value below is eight 64-bit limbs, least significant
 * limb first (the borrow chain in .reduce_once starts at offset 0 and
 * works upwards).
 */
.section .rodata

/* Bit length of p; fp_random masks the top limb with pbits % 64. */
.set pbits, 511

/* The modulus p. */
p:
    .quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
    .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf

/* The all-zero element. */
.global fp_0
fp_0:
    .quad 0, 0, 0, 0, 0, 0, 0, 0

/* 2^512 mod p; .fp_pow starts its accumulator from this value and
   fp_issquare compares its result against it. */
.global fp_1
fp_1:
    .quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1
    .quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80

/* (2^512)^2 mod p; fp_enc multiplies by this constant. */
.r_squared_mod_p:
    .quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1
    .quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e

/* -p^-1 mod 2^64; per-limb reduction factor used by fp_mul3. */
.inv_min_p_mod_r:
    .quad 0x66c1301f632e294d
|
|
|
|
|
|
.section .text

/*
 * fp_copy: copy one eight-limb (64-byte) value.
 *   rdi = destination
 *   rsi = source
 * Clobbers rcx; rdi and rsi are left pointing past the copied data
 * by the string instruction.
 */
.global fp_copy
fp_copy:
    cld                         /* make the string copy ascend */
    mov     rcx, 8              /* eight quadwords */
    rep movsq
    ret
|
|
|
|
/*
 * fp_set: forward the caller's arguments to u512_set (defined
 * elsewhere), then tail-call fp_enc with source == destination so the
 * stored value is converted in place.
 *   rdi = destination (preserved across the u512_set call)
 *   remaining argument registers: whatever u512_set expects
 */
.global fp_set
fp_set:
    push    rdi                 /* keep the destination across the call */
    call    u512_set
    pop     rdi
    mov     rsi, rdi            /* fp_enc(x, x) */
    jmp     fp_enc
|
|
|
|
/*
 * fp_cswap: branch-free conditional swap of two eight-limb values.
 *   rdi = first value
 *   rsi = second value
 *   dl  = condition; 1 swaps, 0 leaves both untouched
 *         (other values give a partial mask -- callers are expected to
 *         pass 0 or 1)
 * Clobbers rax, rcx, rdx, r8 and flags.
 */
.global fp_cswap
fp_cswap:
    movzx   rax, dl
    neg     rax                 /* 0 -> 0, 1 -> all ones */

    .irp k, 0, 1, 2, 3, 4, 5, 6, 7
        mov     rcx, [rdi + 8*\k]
        mov     rdx, [rsi + 8*\k]

        mov     r8, rcx
        xor     r8, rdx         /* difference of the two limbs ... */
        and     r8, rax         /* ... kept only when swapping */

        xor     rcx, r8
        xor     rdx, r8

        mov     [rdi + 8*\k], rcx
        mov     [rsi + 8*\k], rdx
    .endr

    ret
|
|
|
|
/*
 * cswap2 reg, mem: masked store.  With rax = all ones, mem receives the
 * value reg held on entry; with rax = 0, mem is left unchanged.
 * reg is destroyed either way.
 */
.macro cswap2, reg, mem
    xor     \reg, \mem
    and     \reg, rax
    xor     \mem, \reg
.endm

/*
 * .reduce_once: replace the eight-limb value x at [rdi] by x - p when
 * that subtraction does not borrow (x >= p); otherwise leave x alone.
 * The selection is done with a mask, not a branch.
 *   rdi = value, updated in place
 * Clobbers rax, rcx, rdx, rsi, rdi, r8-r11 and flags; rbp is saved
 * and restored.
 */
.reduce_once:
    push    rbp
    mov     rbp, rdi            /* rbp = x; rdi is reused as a limb below */

    /* rdi, rsi, rdx, rcx, r8..r11 = x - p, limb by limb */
    mov     rdi, [rbp + 0]
    sub     rdi, [rip + p + 0]
    mov     rsi, [rbp + 8]
    sbb     rsi, [rip + p + 8]
    mov     rdx, [rbp + 16]
    sbb     rdx, [rip + p + 16]
    mov     rcx, [rbp + 24]
    sbb     rcx, [rip + p + 24]
    mov     r8, [rbp + 32]
    sbb     r8, [rip + p + 32]
    mov     r9, [rbp + 40]
    sbb     r9, [rip + p + 40]
    mov     r10, [rbp + 48]
    sbb     r10, [rip + p + 48]
    mov     r11, [rbp + 56]
    sbb     r11, [rip + p + 56]

    /* rax = all ones if there was no borrow, else 0 */
    setnc   al
    movzx   rax, al
    neg     rax

    cswap2 rdi, [rbp + 0]
    cswap2 rsi, [rbp + 8]
    cswap2 rdx, [rbp + 16]
    cswap2 rcx, [rbp + 24]
    cswap2 r8, [rbp + 32]
    cswap2 r9, [rbp + 40]
    cswap2 r10, [rbp + 48]
    cswap2 r11, [rbp + 56]

    pop     rbp
    ret
|
|
|
|
/*
 * fp_add3: three-operand modular addition.
 * Passes rdi, rsi, rdx straight to u512_add3 (defined elsewhere;
 * presumably [rdi] = [rsi] + [rdx] -- confirm against that routine),
 * then tail-calls .reduce_once on the result at [rdi].
 */
.global fp_add3
fp_add3:
    push    rdi                 /* result pointer must survive the call */
    call    u512_add3
    pop     rdi
    jmp     .reduce_once
|
|
|
|
/*
 * fp_add2: two-operand form of fp_add3.
 *   rdi = x (also the destination), rsi = y
 * Tail-calls fp_add3(x, y, x).
 */
.global fp_add2
fp_add2:
    mov     rdx, rdi            /* third operand = destination */
    jmp     fp_add3
|
|
|
|
/*
 * fp_sub3: three-operand modular subtraction.
 * Passes rdi, rsi, rdx straight to u512_sub3 (defined elsewhere), then
 * adds p back to the result at [rdi] when u512_sub3 returned a nonzero
 * rax (presumably its borrow-out -- confirm against that routine).
 * The addend is chosen with cmov, so there is no branch on the borrow.
 * Clobbers rax, rcx, rdx, rsi, r8-r11 and flags.
 */
.global fp_sub3
fp_sub3:
    push    rdi                 /* result pointer must survive the call */
    call    u512_sub3
    pop     rdi

    /* addend limbs 1..7 default to zero; limb 0 lives in rax */
    .irp reg, rsi, rdx, rcx, r8, r9, r10, r11
        xor     \reg, \reg
    .endr

    /* if rax != 0: (rax, rsi, rdx, rcx, r8..r11) = p; else all zero */
    test    rax, rax
    cmovnz  rax, [rip + p + 0]
    cmovnz  rsi, [rip + p + 8]
    cmovnz  rdx, [rip + p + 16]
    cmovnz  rcx, [rip + p + 24]
    cmovnz  r8, [rip + p + 32]
    cmovnz  r9, [rip + p + 40]
    cmovnz  r10, [rip + p + 48]
    cmovnz  r11, [rip + p + 56]

    /* result += addend */
    add     [rdi + 0], rax
    adc     [rdi + 8], rsi
    adc     [rdi + 16], rdx
    adc     [rdi + 24], rcx
    adc     [rdi + 32], r8
    adc     [rdi + 40], r9
    adc     [rdi + 48], r10
    adc     [rdi + 56], r11
    ret
|
|
|
|
/*
 * fp_sub2: two-operand form of fp_sub3.
 *   rdi = x (also the destination), rsi = y
 * Tail-calls fp_sub3(x, x, y).
 */
.global fp_sub2
fp_sub2:
    mov     rdx, rdi            /* rdx = x */
    xchg    rsi, rdx            /* rsi = x, rdx = y */
    jmp     fp_sub3
|
|
|
|
|
|
/* Montgomery arithmetic */
|
|
|
|
/*
 * fp_enc: multiply the value at [rsi] by (2^512)^2 mod p with fp_mul3
 * and store the product at [rdi].
 */
.global fp_enc
fp_enc:
    lea     rdx, [rip + .r_squared_mod_p]
    jmp     fp_mul3
|
|
|
|
/*
 * fp_dec: multiply the value at [rsi] by u512_1 (defined elsewhere;
 * presumably the plain integer 1) with fp_mul3 and store the product
 * at [rdi].
 */
.global fp_dec
fp_dec:
    lea     rdx, [rip + u512_1]
    jmp     fp_mul3
|
|
|
|
/*
 * fp_mul3: Montgomery multiplication, interleaving one reduction row
 * and one multiplication row per limb of the first factor.
 *   rdi = destination
 *   rsi = first factor  (a)
 *   rdx = second factor (b)
 * Uses mulx/adcx/adox, so the CPU needs BMI2 and ADX.
 * Callee-saved rbx, rbp, r12-r15 are saved and restored; the final
 * conditional subtraction is done by tail-calling .reduce_once.
 */
.global fp_mul3
fp_mul3:
    .irp reg, rbp, rbx, r12, r13, r14, r15
        push    \reg
    .endr

    push    rdi                 /* destination; rdi is reused below */

    mov     rdi, rsi            /* rdi = a */
    mov     rsi, rdx            /* rsi = b */

    /* nine-limb accumulator, initially zero */
    .irp reg, r8, r9, r10, r11, r12, r13, r14, r15, rbp
        xor     \reg, \reg
    .endr

    /* flags are already cleared */

/*
 * MULSTEP k, acc0..acc8: one round for limb k of a.
 * acc0 is the lowest accumulator limb, acc8 the highest.  The callers
 * below rotate the register list by one each round, so this round's
 * acc0 register serves as the next round's acc8.
 * Clobbers rax, rbx, rcx, rdx and flags.
 */
.macro MULSTEP, k, acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8

    /* rdx = (low64(a[k] * b[0]) + acc0) * (-p^-1) mod 2^64 */
    mov     rdx, [rsi + 0]
    mulx    rcx, rdx, [rdi + 8*\k]
    add     rdx, \acc0
    mulx    rcx, rdx, [rip + .inv_min_p_mod_r]

    xor     rax, rax /* clear flags */

    /* accumulator += rdx * p; low halves ride OF, high halves ride CF */
    mulx    rbx, rax, [rip + p + 0]
    adox    \acc0, rax

    mulx    rcx, rax, [rip + p + 8]
    adcx    \acc1, rbx
    adox    \acc1, rax

    mulx    rbx, rax, [rip + p + 16]
    adcx    \acc2, rcx
    adox    \acc2, rax

    mulx    rcx, rax, [rip + p + 24]
    adcx    \acc3, rbx
    adox    \acc3, rax

    mulx    rbx, rax, [rip + p + 32]
    adcx    \acc4, rcx
    adox    \acc4, rax

    mulx    rcx, rax, [rip + p + 40]
    adcx    \acc5, rbx
    adox    \acc5, rax

    mulx    rbx, rax, [rip + p + 48]
    adcx    \acc6, rcx
    adox    \acc6, rax

    mulx    rcx, rax, [rip + p + 56]
    adcx    \acc7, rbx
    adox    \acc7, rax

    mov     rax, 0              /* mov, not xor: both carry flags are live */
    adcx    \acc8, rcx
    adox    \acc8, rax

    mov     rdx, [rdi + 8*\k]

    xor     rax, rax /* clear flags */

    /* accumulator += a[k] * b, same two-chain pattern */
    mulx    rbx, rax, [rsi + 0]
    adox    \acc0, rax

    mulx    rcx, rax, [rsi + 8]
    adcx    \acc1, rbx
    adox    \acc1, rax

    mulx    rbx, rax, [rsi + 16]
    adcx    \acc2, rcx
    adox    \acc2, rax

    mulx    rcx, rax, [rsi + 24]
    adcx    \acc3, rbx
    adox    \acc3, rax

    mulx    rbx, rax, [rsi + 32]
    adcx    \acc4, rcx
    adox    \acc4, rax

    mulx    rcx, rax, [rsi + 40]
    adcx    \acc5, rbx
    adox    \acc5, rax

    mulx    rbx, rax, [rsi + 48]
    adcx    \acc6, rcx
    adox    \acc6, rax

    mulx    rcx, rax, [rsi + 56]
    adcx    \acc7, rbx
    adox    \acc7, rax

    mov     rax, 0              /* mov, not xor: both carry flags are live */
    adcx    \acc8, rcx
    adox    \acc8, rax

.endm

    MULSTEP 0, r8, r9, r10, r11, r12, r13, r14, r15, rbp
    MULSTEP 1, r9, r10, r11, r12, r13, r14, r15, rbp, r8
    MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8, r9
    MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8, r9, r10
    MULSTEP 4, r12, r13, r14, r15, rbp, r8, r9, r10, r11
    MULSTEP 5, r13, r14, r15, rbp, r8, r9, r10, r11, r12
    MULSTEP 6, r14, r15, rbp, r8, r9, r10, r11, r12, r13
    MULSTEP 7, r15, rbp, r8, r9, r10, r11, r12, r13, r14

    pop     rdi                 /* destination */

    /* after the last round the eight result limbs are rbp, r8..r14 */
    mov     [rdi + 0], rbp
    mov     [rdi + 8], r8
    mov     [rdi + 16], r9
    mov     [rdi + 24], r10
    mov     [rdi + 32], r11
    mov     [rdi + 40], r12
    mov     [rdi + 48], r13
    mov     [rdi + 56], r14

    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    pop     rbp
    jmp     .reduce_once
|
|
|
|
/*
 * fp_mul2: two-operand form of fp_mul3.
 *   rdi = x (also the destination), rsi = y
 * Tail-calls fp_mul3(x, y, x).
 */
.global fp_mul2
fp_mul2:
    mov     rdx, rdi            /* second factor = destination */
    jmp     fp_mul3
|
|
|
|
/*
 * fp_sq2: squaring via the general multiplier.
 *   rdi = destination, rsi = x
 * Tail-calls fp_mul3(dest, x, x).
 */
.global fp_sq2
fp_sq2:
    /* TODO implement optimized Montgomery squaring */
    mov     rdx, rsi            /* both factors are x */
    jmp     fp_mul3
|
|
|
|
/*
 * fp_sq1: in-place squaring.
 *   rdi = x (also the destination)
 * Tail-calls fp_sq2(x, x).
 */
.global fp_sq1
fp_sq1:
    mov     rsi, rdi            /* source = destination */
    jmp     fp_sq2
|
|
|
|
/*
 * POWSTEP limb: consume the 64 bits of exponent limb number `limb`,
 * lowest bit first.  For every bit: if it is set, multiply the result
 * (pointer saved at [rsp + 64]) by the running power (the 64-byte
 * buffer at rsp); then square the running power.
 * Uses r13 for the remaining bits and r12 as the bit counter.
 */
.macro POWSTEP, limb
    mov     r13, [rbx + 8*\limb]
    xor     r12, r12

0:
    test    r13, 1
    jz      1f                  /* bit clear: skip the multiply */

    mov     rdi, [rsp + 64]
    mov     rsi, rsp
    call    fp_mul2

1:
    mov     rdi, rsp
    call    fp_sq1

    shr     r13

    inc     r12
    test    r12, 64             /* bit 6 first becomes set at r12 == 64 */
    jz      0b
.endm

/* (obviously) not constant time in the exponent! */
/*
 * .fp_pow: in-place exponentiation by an eight-limb exponent using
 * right-to-left square-and-multiply.
 *   rdi = base; overwritten with the result
 *   rsi = exponent (eight limbs, least significant first)
 * rbx, r12, r13 are saved and restored; 64 bytes of stack hold the
 * running power.
 */
.fp_pow:
    push    rbx
    mov     rbx, rsi            /* rbx = exponent, live across all calls */
    push    r12
    push    r13
    push    rdi                 /* result pointer, found at [rsp + 64] below */
    sub     rsp, 64             /* scratch buffer for the running power */

    /* running power = base */
    mov     rsi, rdi
    mov     rdi, rsp
    call    fp_copy

    /* result = fp_1 */
    mov     rdi, [rsp + 64]
    lea     rsi, [rip + fp_1]
    call    fp_copy

    .irp i, 0, 1, 2, 3, 4, 5, 6, 7
        POWSTEP \i
    .endr

    add     rsp, 64+8           /* drop the buffer and the saved rdi */
    pop     r13
    pop     r12
    pop     rbx
    ret
|
|
|
|
.section .rodata

/* p - 2, the exponent used by fp_inv. */
.p_minus_2:
    .quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
    .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf

.section .text

/*
 * fp_inv: in-place inversion, computed as x^(p-2) by .fp_pow.
 *   rdi = x; overwritten with the result
 */
/* TODO use a better addition chain? */
.global fp_inv
fp_inv:
    lea     rsi, [rip + .p_minus_2]
    jmp     .fp_pow
|
|
|
|
.section .rodata

/* (p - 1) / 2, the exponent used by fp_issquare. */
.p_minus_1_halves:
    .quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83
    .quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df

.section .text

/*
 * fp_issquare: raise x to (p-1)/2 with .fp_pow and report whether the
 * result equals fp_1.
 *   rdi = x; NOTE: overwritten with x^((p-1)/2)
 *   returns rax = 1 if the power equals fp_1, else 0
 * Clobbers rsi and flags in addition to whatever .fp_pow clobbers.
 */
/* TODO use a better addition chain? */
.global fp_issquare
fp_issquare:
    push    rdi                 /* needed again for the comparison */
    lea     rsi, [rip + .p_minus_1_halves]
    call    .fp_pow
    pop     rdi

    /* rax = OR over all limbs of (result XOR fp_1); zero iff equal */
    xor     rax, rax
    .irp k, 0, 1, 2, 3, 4, 5, 6, 7
        mov     rsi, [rdi + 8*\k]
        xor     rsi, [rip + fp_1 + 8*\k]
        or      rax, rsi
    .endr

    test    rax, rax
    setz    al
    movzx   rax, al
    ret
|
|
|
|
|
|
/* not constant time (but this shouldn't leak anything of importance) */
/*
 * fp_random: fill [rdi] with a uniformly random value in [0, p) by
 * rejection sampling.
 *   rdi = destination (eight limbs)
 * Draws 64 bytes from randombytes (defined elsewhere), masks the top
 * limb down to pbits % 64 bits, and retries until the candidate is
 * strictly below p.
 *
 * The candidate is compared with p limb by limb, most significant
 * first, using UNSIGNED conditions: above -> retry, below -> accept,
 * equal -> fall through to the next lower limb.  The previous jge/jl
 * pair was signed and covered every outcome between them, so only the
 * top limb was ever examined and candidates whose top limb equals that
 * of p were always rejected.
 *
 * Clobbers rax, rsi, flags and whatever randombytes clobbers.
 */
.global fp_random
fp_random:

    push    rdi                 /* destination must survive the call */
    mov     rsi, 64             /* number of random bytes */
    call    randombytes
    pop     rdi

    /* keep only the low (pbits % 64) bits of the top limb */
    mov     rax, 1
    shl     rax, (pbits % 64)
    dec     rax
    and     [rdi + 56], rax

    .irp k, 7, 6, 5, 4, 3, 2, 1, 0
        mov     rax, [rip + p + 8*\k]
        cmp     [rdi + 8*\k], rax
        ja      fp_random       /* candidate > p: draw again */
        jb      0f              /* candidate < p: accept */
    .endr
    jmp     fp_random           /* every limb equal: candidate == p, reject */

0:
    ret
|
|
|