
.intel_syntax noprefix

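/*
 * Arithmetic in F_p for a 511-bit prime p.  Field elements are eight 64-bit
 * little-endian limbs (64 bytes) and are kept in Montgomery form: x is stored
 * as x*2^512 mod p; fp_enc and fp_dec convert to and from that form.
 *
 * All entry points use the System V AMD64 calling convention (arguments in
 * rdi, rsi, rdx).  u512_set, u512_add3, u512_sub3, u512_1 and randombytes are
 * external symbols provided elsewhere in the project.  The multiplication
 * code uses mulx/adcx/adox, so BMI2 and ADX support is required.
 *
 * A rough C-level view of the exported functions (the type and parameter
 * names are an illustrative assumption, not taken from this file):
 *
 *     typedef struct { uint64_t c[8]; } fp;            // little-endian limbs
 *     void fp_copy(fp *dst, fp const *src);
 *     void fp_cswap(fp *x, fp *y, uint64_t c);         // c must be 0 or 1
 *     void fp_add3(fp *x, fp const *y, fp const *z);   // x = y + z
 *     void fp_add2(fp *x, fp const *y);                // x += y
 *     void fp_sub3(fp *x, fp const *y, fp const *z);   // x = y - z
 *     void fp_sub2(fp *x, fp const *y);                // x -= y
 *     void fp_mul3(fp *x, fp const *y, fp const *z);   // x = y * z
 *     void fp_mul2(fp *x, fp const *y);                // x *= y
 *     void fp_sq2(fp *x, fp const *y);                 // x = y^2
 *     void fp_sq1(fp *x);                              // x = x^2
 *     void fp_inv(fp *x);                              // x = x^-1 (0 -> 0)
 *     uint64_t fp_issquare(fp *x);                     // 1 if nonzero square; clobbers x
 *     void fp_random(fp *x);
 */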
.section .rodata

.set pbits, 511
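/* the prime modulus p: eight little-endian 64-bit limbs, 2^510 < p < 2^511 */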
p:
    .quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
    .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf


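/* the field constants 0 and 1, already in Montgomery form */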
.global fp_0
fp_0: .quad 0, 0, 0, 0, 0, 0, 0, 0

.global fp_1
fp_1: /* 2^512 mod p */
    .quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1
    .quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80


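/* multiplying by this constant with fp_mul3 converts into Montgomery form (see fp_enc) */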
/* (2^512)^2 mod p */
.r_squared_mod_p:
    .quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1
    .quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e

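/* per-word Montgomery factor: for a low limb t, m = t * (-p^-1) mod 2^64
   makes t + m*p divisible by 2^64 (see MULSTEP below) */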
/* -p^-1 mod 2^64 */
.inv_min_p_mod_r:
    .quad 0x66c1301f632e294d


.section .text

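/* fp_copy(dst, src): copy all eight limbs */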
.global fp_copy
fp_copy:
    cld
    mov rcx, 8
    rep movsq
    ret

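/* fp_set(x, c): initialize x via u512_set (external), then convert it to
   Montgomery form in place with fp_enc */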
.global fp_set
fp_set:
    push rdi
    call u512_set
    pop rdi
    mov rsi, rdi
    jmp fp_enc

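/* fp_cswap(x, y, c): constant-time conditional swap; c (in dl) must be 0 or 1
   and is stretched into an all-zeros/all-ones mask */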
.global fp_cswap
fp_cswap:
    movzx rax, dl
    neg rax
    .set k, 0
    .rept 8
        mov rcx, [rdi + 8*k]
        mov rdx, [rsi + 8*k]

        mov r8, rcx
        xor r8, rdx
        and r8, rax

        xor rcx, r8
        xor rdx, r8

        mov [rdi + 8*k], rcx
        mov [rsi + 8*k], rdx

        .set k, k+1
    .endr
    ret

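/* .reduce_once(x): if x >= p, subtract p once; branch-free.  The difference
   x - p is computed limb by limb, and the final borrow decides (via a mask)
   whether the difference or the original value is written back. */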
.reduce_once:
    push rbp
    mov rbp, rdi

    mov rdi, [rbp + 0]
    sub rdi, [rip + p + 0]
    mov rsi, [rbp + 8]
    sbb rsi, [rip + p + 8]
    mov rdx, [rbp + 16]
    sbb rdx, [rip + p + 16]
    mov rcx, [rbp + 24]
    sbb rcx, [rip + p + 24]
    mov r8, [rbp + 32]
    sbb r8, [rip + p + 32]
    mov r9, [rbp + 40]
    sbb r9, [rip + p + 40]
    mov r10, [rbp + 48]
    sbb r10, [rip + p + 48]
    mov r11, [rbp + 56]
    sbb r11, [rip + p + 56]

    setnc al        /* no borrow <=> the input is >= p */
    movzx rax, al
    neg rax         /* all-ones mask if >= p, else zero */

.macro cswap2, r, m
    xor \r, \m
    and \r, rax
    xor \m, \r
.endm

    cswap2 rdi, [rbp + 0]
    cswap2 rsi, [rbp + 8]
    cswap2 rdx, [rbp + 16]
    cswap2 rcx, [rbp + 24]
    cswap2 r8, [rbp + 32]
    cswap2 r9, [rbp + 40]
    cswap2 r10, [rbp + 48]
    cswap2 r11, [rbp + 56]

    pop rbp
    ret

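/* fp_add3(x, y, z): x = y + z mod p.  u512_add3 (external) performs the plain
   512-bit addition; assuming both inputs are already reduced below p < 2^511,
   the sum cannot overflow 512 bits, so one conditional subtraction suffices. */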
.global fp_add3
fp_add3:
    push rdi
    call u512_add3
    pop rdi
    jmp .reduce_once

.global fp_add2
fp_add2:
    mov rdx, rdi
    jmp fp_add3

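/* fp_sub3(x, y, z): x = y - z mod p.  u512_sub3 (external) returns the borrow
   in rax; if it borrowed, p is added back.  Note that cmov does not modify
   flags, so the whole chain below is driven by the single test rax, rax. */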
.global fp_sub3
fp_sub3:
    push rdi
    call u512_sub3
    pop rdi
    xor rsi, rsi
    xor rdx, rdx
    xor rcx, rcx
    xor r8, r8
    xor r9, r9
    xor r10, r10
    xor r11, r11
    test rax, rax
    cmovnz rax, [rip + p + 0]
    cmovnz rsi, [rip + p + 8]
    cmovnz rdx, [rip + p + 16]
    cmovnz rcx, [rip + p + 24]
    cmovnz r8, [rip + p + 32]
    cmovnz r9, [rip + p + 40]
    cmovnz r10, [rip + p + 48]
    cmovnz r11, [rip + p + 56]
    add [rdi + 0], rax
    adc [rdi + 8], rsi
    adc [rdi + 16], rdx
    adc [rdi + 24], rcx
    adc [rdi + 32], r8
    adc [rdi + 40], r9
    adc [rdi + 48], r10
    adc [rdi + 56], r11
    ret

.global fp_sub2
fp_sub2:
    mov rdx, rdi
    xchg rsi, rdx
    jmp fp_sub3


/* Montgomery arithmetic */

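/* An element x is stored as X = x*R mod p with R = 2^512.  fp_mul3 computes
   the Montgomery product X*Y*R^-1 mod p, so multiplying by R^2 converts into
   Montgomery form (fp_enc) and multiplying by the plain constant 1 converts
   back out (fp_dec). */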
.global fp_enc
fp_enc:
    lea rdx, [rip + .r_squared_mod_p]
    jmp fp_mul3

.global fp_dec
fp_dec:
    lea rdx, [rip + u512_1]
    jmp fp_mul3

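/* fp_mul3(x, y, z): x = y*z*R^-1 mod p, operand-scanning Montgomery
   multiplication.  The 9-limb accumulator lives in r8..r15, rbp.  Round k
   (MULSTEP k) predicts the new low limb, derives m = low * (-p^-1) mod 2^64,
   then adds both m*p and y[k]*z using the two independent adcx/adox carry
   chains; the low limb ends up zero, which realizes the division by 2^64.
   A final .reduce_once brings the result below p. */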
.global fp_mul3
fp_mul3:
    push rbp
    push rbx
    push r12
    push r13
    push r14
    push r15

    push rdi

    mov rdi, rsi
    mov rsi, rdx

    xor r8, r8
    xor r9, r9
    xor r10, r10
    xor r11, r11
    xor r12, r12
    xor r13, r13
    xor r14, r14
    xor r15, r15
    xor rbp, rbp

    /* flags are already cleared */

.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8

    /* m is chosen so that \r0 + m*p[0] + y[k]*z[0] == 0 mod 2^64 */
    mov rdx, [rsi + 0]
    mulx rcx, rdx, [rdi + 8*\k]
    add rdx, \r0
    mulx rcx, rdx, [rip + .inv_min_p_mod_r]

    xor rax, rax /* clear flags */

    /* accumulate m*p */
    mulx rbx, rax, [rip + p + 0]
    adox \r0, rax

    mulx rcx, rax, [rip + p + 8]
    adcx \r1, rbx
    adox \r1, rax

    mulx rbx, rax, [rip + p + 16]
    adcx \r2, rcx
    adox \r2, rax

    mulx rcx, rax, [rip + p + 24]
    adcx \r3, rbx
    adox \r3, rax

    mulx rbx, rax, [rip + p + 32]
    adcx \r4, rcx
    adox \r4, rax

    mulx rcx, rax, [rip + p + 40]
    adcx \r5, rbx
    adox \r5, rax

    mulx rbx, rax, [rip + p + 48]
    adcx \r6, rcx
    adox \r6, rax

    mulx rcx, rax, [rip + p + 56]
    adcx \r7, rbx
    adox \r7, rax

    mov rax, 0 /* not xor: flags must survive */
    adcx \r8, rcx
    adox \r8, rax


    /* accumulate y[k]*z; afterwards \r0 is zero and becomes the next round's top limb */
    mov rdx, [rdi + 8*\k]

    xor rax, rax /* clear flags */

    mulx rbx, rax, [rsi + 0]
    adox \r0, rax

    mulx rcx, rax, [rsi + 8]
    adcx \r1, rbx
    adox \r1, rax

    mulx rbx, rax, [rsi + 16]
    adcx \r2, rcx
    adox \r2, rax

    mulx rcx, rax, [rsi + 24]
    adcx \r3, rbx
    adox \r3, rax

    mulx rbx, rax, [rsi + 32]
    adcx \r4, rcx
    adox \r4, rax

    mulx rcx, rax, [rsi + 40]
    adcx \r5, rbx
    adox \r5, rax

    mulx rbx, rax, [rsi + 48]
    adcx \r6, rcx
    adox \r6, rax

    mulx rcx, rax, [rsi + 56]
    adcx \r7, rbx
    adox \r7, rax

    mov rax, 0 /* not xor: flags must survive */
    adcx \r8, rcx
    adox \r8, rax

.endm

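/* Each round shifts the accumulator window down by one register: the limb
   that was just zeroed re-enters at the top and collects the new carries. */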
    MULSTEP 0, r8, r9, r10, r11, r12, r13, r14, r15, rbp
    MULSTEP 1, r9, r10, r11, r12, r13, r14, r15, rbp, r8
    MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8, r9
    MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8, r9, r10
    MULSTEP 4, r12, r13, r14, r15, rbp, r8, r9, r10, r11
    MULSTEP 5, r13, r14, r15, rbp, r8, r9, r10, r11, r12
    MULSTEP 6, r14, r15, rbp, r8, r9, r10, r11, r12, r13
    MULSTEP 7, r15, rbp, r8, r9, r10, r11, r12, r13, r14

    pop rdi

    /* the product, still possibly >= p, sits in rbp, r8, ..., r14 (low to high) */
    mov [rdi + 0], rbp
    mov [rdi + 8], r8
    mov [rdi + 16], r9
    mov [rdi + 24], r10
    mov [rdi + 32], r11
    mov [rdi + 40], r12
    mov [rdi + 48], r13
    mov [rdi + 56], r14

    pop r15
    pop r14
    pop r13
    pop r12
    pop rbx
    pop rbp
    jmp .reduce_once

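/* two-operand wrappers: fp_mul2(x, y): x *= y;  fp_sq2(x, y): x = y^2;
   fp_sq1(x): x = x^2 (all routed through fp_mul3 for now) */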
.global fp_mul2
fp_mul2:
    mov rdx, rdi
    jmp fp_mul3

.global fp_sq2
fp_sq2:
    /* TODO implement optimized Montgomery squaring */
    mov rdx, rsi
    jmp fp_mul3

.global fp_sq1
fp_sq1:
    mov rsi, rdi
    jmp fp_sq2

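/* .fp_pow(x, e): x = x^e mod p, right-to-left binary exponentiation.  The
   exponent e is eight little-endian limbs at rsi.  The running square lives
   in a 64-byte stack buffer (initialized to x); [rsp + 64] holds the saved
   pointer to x, which starts out as fp_1 and accumulates the result. */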
/* (obviously) not constant time in the exponent! */
.fp_pow:
    push rbx
    mov rbx, rsi
    push r12
    push r13
    push rdi
    sub rsp, 64

    mov rsi, rdi
    mov rdi, rsp
    call fp_copy

    mov rdi, [rsp + 64]
    lea rsi, [rip + fp_1]
    call fp_copy

.macro POWSTEP, k
    mov r13, [rbx + 8*\k]   /* current exponent limb */
    xor r12, r12

    0:
    test r13, 1
    jz 1f

    mov rdi, [rsp + 64]
    mov rsi, rsp
    call fp_mul2

    1:
    mov rdi, rsp
    call fp_sq1

    shr r13

    inc r12
    test r12, 64            /* bit 6 set <=> 64 iterations done */
    jz 0b
.endm


    POWSTEP 0
    POWSTEP 1
    POWSTEP 2
    POWSTEP 3
    POWSTEP 4
    POWSTEP 5
    POWSTEP 6
    POWSTEP 7

    add rsp, 64+8
    pop r13
    pop r12
    pop rbx
    ret

.section .rodata
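/* p - 2: the exponent used by fp_inv (Fermat: x^(p-2) = x^-1 for x != 0; 0 maps to 0) */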
.p_minus_2:
    .quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
    .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf

.section .text

/* TODO use a better addition chain? */
.global fp_inv
fp_inv:
    lea rsi, [rip + .p_minus_2]
    jmp .fp_pow

.section .rodata
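/* (p - 1) / 2: the exponent for the Euler criterion used by fp_issquare */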
.p_minus_1_halves:
    .quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83
    .quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df

.section .text

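/* fp_issquare(x): returns 1 in rax if x is a nonzero square mod p, else 0.
   Computes x^((p-1)/2) in place (so x is clobbered) and compares the result
   against fp_1, i.e. against 1 in Montgomery form. */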
/* TODO use a better addition chain? */
.global fp_issquare
fp_issquare:
    push rdi
    lea rsi, [rip + .p_minus_1_halves]
    call .fp_pow
    pop rdi

    xor rax, rax
    .set k, 0
    .rept 8
        mov rsi, [rdi + 8*k]
        xor rsi, [rip + fp_1 + 8*k]
        or rax, rsi
        .set k, k+1
    .endr
    test rax, rax
    setz al
    movzx rax, al
    ret


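/* fp_random(x): rejection sampling.  Fill x with 64 bytes from randombytes
   (external), mask the top limb to pbits mod 64 = 63 bits so that x < 2^511,
   then compare against p limb by limb from the most significant end and
   resample unless x < p. */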
/* not constant time (but this shouldn't leak anything of importance) */
.global fp_random
fp_random:

    push rdi
    mov rsi, 64
    call randombytes
    pop rdi
    mov rax, 1
    shl rax, (pbits % 64)
    dec rax
    and [rdi + 56], rax

    .set k, 7
    .rept 8
        mov rax, [rip + p + 8*k]
        cmp [rdi + 8*k], rax
        ja fp_random    /* this limb is bigger: candidate > p, resample */
        jb 0f           /* this limb is smaller: candidate < p, accept */
        .set k, k-1
    .endr
    jmp fp_random       /* all limbs equal: candidate == p, resample */
    0:
    ret