|
@@ -1393,7 +1393,388 @@ TEXT ·fp751Mul(SB), $96-24 |
|
|
|
|
|
|
|
|
RET |
|
|
RET |
|
|
|
|
|
|
|
|
TEXT ·fp751MontgomeryReduce(SB), $0-16 |
|
|
|
|
|
|
|
|
// This multiplies a 256-bit number pointed to by M0 with p751+1.
// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element.
// Only the seven 64-bit words of M1 at byte offsets 40 through 88 are read.
// C points to the place to store the result and should be at least 192 bits.
// This should only be used when the BMI2 and ADX instruction set extensions
// are available.
//
// Output: the product occupies eleven 64-bit words. The three least
// significant words are written to 0+C, 8+C and 16+C. The remaining eight
// words are left in registers, from least to most significant:
//     T7, T0, T1, T3, T6, T5, T2, T4
//
// Clobbers: AX, DX, T8, T9, T10 and the flags. Each round zeroes AX with
// XORQ (which also clears CF and OF) and then runs two independent carry
// chains: ADCXQ propagates through CF and ADOXQ through OF, while MULXQ
// leaves the flags untouched in between.
#define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
	\ // Round 1: word 0 of M0 times the seven words of M1.
	MOVQ 0+M0, DX \
	MULXQ M1+40(SB), T1, T0 \
	MULXQ M1+48(SB), T3, T2 \
	MOVQ T1, 0+C \ // C0_final
	XORQ AX, AX \
	MULXQ M1+56(SB), T5, T4 \
	ADOXQ T3, T0 \
	ADOXQ T5, T2 \
	MULXQ M1+64(SB), T3, T1 \
	ADOXQ T3, T4 \
	MULXQ M1+72(SB), T6, T5 \
	ADOXQ T6, T1 \
	MULXQ M1+80(SB), T7, T3 \
	ADOXQ T7, T5 \
	MULXQ M1+88(SB), T8, T6 \
	ADOXQ T8, T3 \
	ADOXQ AX, T6 \
	\ // Round 2: word 1 of M0; high halves ride the CF chain.
	MOVQ 8+M0, DX \
	MULXQ M1+40(SB), T7, T8 \
	XORQ AX, AX \
	ADCXQ T7, T0 \
	MOVQ T0, 8+C \ // C1_final
	ADCXQ T8, T2 \
	MULXQ M1+48(SB), T8, T7 \
	ADOXQ T8, T2 \
	ADCXQ T7, T4 \
	MULXQ M1+56(SB), T8, T0 \
	ADOXQ T8, T4 \
	ADCXQ T1, T0 \
	MULXQ M1+64(SB), T7, T1 \
	ADCXQ T5, T1 \
	MULXQ M1+72(SB), T8, T5 \
	ADCXQ T5, T3 \
	MULXQ M1+80(SB), T9, T5 \
	ADCXQ T5, T6 \
	MULXQ M1+88(SB), DX, T5 \
	ADCXQ AX, T5 \
	\ // Finish the OF chain with the remaining low halves.
	ADOXQ T7, T0 \
	ADOXQ T8, T1 \
	ADOXQ T9, T3 \
	ADOXQ DX, T6 \
	ADOXQ AX, T5 \
	\ // Round 3: word 2 of M0.
	MOVQ 16+M0, DX \
	MULXQ M1+40(SB), T7, T8 \
	XORQ AX, AX \
	ADCXQ T7, T2 \
	MOVQ T2, 16+C \ // C2_final
	ADCXQ T8, T4 \
	MULXQ M1+48(SB), T7, T8 \
	ADOXQ T7, T4 \
	ADCXQ T8, T0 \
	MULXQ M1+56(SB), T8, T2 \
	ADOXQ T8, T0 \
	ADCXQ T2, T1 \
	MULXQ M1+64(SB), T7, T2 \
	ADCXQ T2, T3 \
	MULXQ M1+72(SB), T8, T2 \
	ADCXQ T2, T6 \
	MULXQ M1+80(SB), T9, T2 \
	ADCXQ T2, T5 \
	MULXQ M1+88(SB), DX, T2 \
	ADCXQ AX, T2 \
	\
	ADOXQ T7, T1 \
	ADOXQ T8, T3 \
	ADOXQ T9, T6 \
	ADOXQ DX, T5 \
	ADOXQ AX, T2 \
	\ // Round 4: word 3 of M0; result word 3 stays in T7.
	MOVQ 24+M0, DX \
	MULXQ M1+40(SB), T7, T8 \
	XORQ AX, AX \
	ADCXQ T4, T7 \
	ADCXQ T8, T0 \
	MULXQ M1+48(SB), T10, T8 \
	ADOXQ T10, T0 \
	ADCXQ T8, T1 \
	MULXQ M1+56(SB), T8, T4 \
	ADOXQ T8, T1 \
	ADCXQ T4, T3 \
	MULXQ M1+64(SB), T10, T4 \
	ADCXQ T4, T6 \
	MULXQ M1+72(SB), T8, T4 \
	ADCXQ T4, T5 \
	MULXQ M1+80(SB), T9, T4 \
	ADCXQ T4, T2 \
	MULXQ M1+88(SB), DX, T4 \
	ADCXQ AX, T4 \
	\
	ADOXQ T10, T3 \
	ADOXQ T8, T6 \
	ADOXQ T9, T5 \
	ADOXQ DX, T2 \
	ADOXQ AX, T4
|
|
|
|
|
|
|
|
|
|
|
// This multiplies a 256-bit number pointed to by M0 with p751+1.
// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element.
// Only the seven 64-bit words of M1 at byte offsets 40 through 88 are read.
// C points to the place to store the result. Besides the three result words
// written to 0+C, 8+C and 16+C, this variant also uses 32+C and 40+C as
// scratch space, so at least 384 bits (48 bytes) must be writable at C.
// This should only be used when the BMI2 instruction set extension is
// available.
//
// Output: the product occupies eleven 64-bit words. The three least
// significant words are written to 0+C, 8+C and 16+C. The remaining eight
// words are left in registers, from least to most significant:
//     T7, T0, T1, T3, T6, T5, T2, T4
//
// Clobbers: AX, DX, T8, T9, T10, the flags, and the memory at 32+C and 40+C.
// With a single carry flag available, rounds 2 to 4 first add the high halves
// of the partial products in one ADDQ/ADCQ chain (parking two low halves in
// memory because registers run out), then add the low halves in a second
// chain. MULXQ and MOVQ do not modify the flags, so they can sit inside a
// chain.
#define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
	\ // Round 1: word 0 of M0 times the seven words of M1.
	MOVQ 0+M0, DX \
	MULXQ M1+40(SB), T1, T0 \
	MULXQ M1+48(SB), T3, T2 \
	MOVQ T1, 0+C \ // C0_final
	XORQ AX, AX \
	MULXQ M1+56(SB), T5, T4 \
	ADDQ T3, T0 \
	ADCQ T5, T2 \
	MULXQ M1+64(SB), T3, T1 \
	ADCQ T3, T4 \
	MULXQ M1+72(SB), T6, T5 \
	ADCQ T6, T1 \
	MULXQ M1+80(SB), T7, T3 \
	ADCQ T7, T5 \
	MULXQ M1+88(SB), T8, T6 \
	ADCQ T8, T3 \
	ADCQ AX, T6 \
	\ // Round 2: word 1 of M0; first chain adds the high halves.
	MOVQ 8+M0, DX \
	MULXQ M1+40(SB), T7, T8 \
	ADDQ T7, T0 \
	MOVQ T0, 8+C \ // C1_final
	ADCQ T8, T2 \
	MULXQ M1+48(SB), T8, T7 \
	MOVQ T8, 32+C \ // spill low half
	ADCQ T7, T4 \
	MULXQ M1+56(SB), T8, T0 \
	MOVQ T8, 40+C \ // spill low half
	ADCQ T1, T0 \
	MULXQ M1+64(SB), T7, T1 \
	ADCQ T5, T1 \
	MULXQ M1+72(SB), T8, T5 \
	ADCQ T5, T3 \
	MULXQ M1+80(SB), T9, T5 \
	ADCQ T5, T6 \
	MULXQ M1+88(SB), DX, T5 \
	ADCQ AX, T5 \
	\ // Second chain adds the low halves.
	XORQ AX, AX \
	ADDQ 32+C, T2 \
	ADCQ 40+C, T4 \
	ADCQ T7, T0 \
	ADCQ T8, T1 \
	ADCQ T9, T3 \
	ADCQ DX, T6 \
	ADCQ AX, T5 \
	\ // Round 3: word 2 of M0.
	MOVQ 16+M0, DX \
	MULXQ M1+40(SB), T7, T8 \
	ADDQ T7, T2 \
	MOVQ T2, 16+C \ // C2_final
	ADCQ T8, T4 \
	MULXQ M1+48(SB), T7, T8 \
	MOVQ T7, 32+C \
	ADCQ T8, T0 \
	MULXQ M1+56(SB), T8, T2 \
	MOVQ T8, 40+C \
	ADCQ T2, T1 \
	MULXQ M1+64(SB), T7, T2 \
	ADCQ T2, T3 \
	MULXQ M1+72(SB), T8, T2 \
	ADCQ T2, T6 \
	MULXQ M1+80(SB), T9, T2 \
	ADCQ T2, T5 \
	MULXQ M1+88(SB), DX, T2 \
	ADCQ AX, T2 \
	\
	XORQ AX, AX \
	ADDQ 32+C, T4 \
	ADCQ 40+C, T0 \
	ADCQ T7, T1 \
	ADCQ T8, T3 \
	ADCQ T9, T6 \
	ADCQ DX, T5 \
	ADCQ AX, T2 \
	\ // Round 4: word 3 of M0; result word 3 stays in T7.
	MOVQ 24+M0, DX \
	MULXQ M1+40(SB), T7, T8 \
	ADDQ T4, T7 \
	ADCQ T8, T0 \
	MULXQ M1+48(SB), T10, T8 \
	MOVQ T10, 32+C \
	ADCQ T8, T1 \
	MULXQ M1+56(SB), T8, T4 \
	MOVQ T8, 40+C \
	ADCQ T4, T3 \
	MULXQ M1+64(SB), T10, T4 \
	ADCQ T4, T6 \
	MULXQ M1+72(SB), T8, T4 \
	ADCQ T4, T5 \
	MULXQ M1+80(SB), T9, T4 \
	ADCQ T4, T2 \
	MULXQ M1+88(SB), DX, T4 \
	ADCQ AX, T4 \
	\
	XORQ AX, AX \
	ADDQ 32+C, T0 \
	ADCQ 40+C, T1 \
	ADCQ T10, T3 \
	ADCQ T8, T6 \
	ADCQ T9, T5 \
	ADCQ DX, T2 \
	ADCQ AX, T4
|
|
|
|
|
|
|
|
|
|
|
// fp751MontgomeryReduceCommonPart1 adds an eleven-word value into the
// multi-word number at M0, in place, starting at byte offset 40 of M0.
//
// The addend is taken, from least to most significant word, from:
//     48+C, 56+C, 64+C (memory), then T8, T0, T1, T2, T3, T4, T5, T6.
// The sums are written back to 40+M0 through 120+M0. T7 is zeroed first and
// is used to fold the carry into the word at 128+M0; the carry is then
// propagated through the words at 136+M0 through 184+M0.
//
// Clobbers: AX, DX, T0-T9 and the flags. M0 is modified.
#define fp751MontgomeryReduceCommonPart1(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
	XORQ T7, T7 \
	MOVQ 48+C, AX \
	MOVQ 56+C, DX \
	MOVQ 64+C, T9 \
	ADDQ 40+M0, AX \
	ADCQ 48+M0, DX \
	ADCQ 56+M0, T9 \
	MOVQ AX, 40+M0 \
	MOVQ DX, 48+M0 \
	MOVQ T9, 56+M0 \
	ADCQ 64+M0, T8 \
	ADCQ 72+M0, T0 \
	ADCQ 80+M0, T1 \
	ADCQ 88+M0, T2 \
	ADCQ 96+M0, T3 \
	ADCQ 104+M0, T4 \
	ADCQ 112+M0, T5 \
	ADCQ 120+M0, T6 \
	ADCQ 128+M0, T7 \
	MOVQ T8, 64+M0 \
	MOVQ T0, 72+M0 \
	MOVQ T1, 80+M0 \
	MOVQ T2, 88+M0 \
	MOVQ T3, 96+M0 \
	MOVQ T4, 104+M0 \
	MOVQ T5, 112+M0 \
	MOVQ T6, 120+M0 \
	MOVQ T7, 128+M0 \
	\ // MOVQ leaves CF intact, so the carry from 128+M0 continues below.
	MOVQ 136+M0, T0 \
	MOVQ 144+M0, T1 \
	MOVQ 152+M0, T2 \
	MOVQ 160+M0, T3 \
	MOVQ 168+M0, T4 \
	MOVQ 176+M0, T5 \
	MOVQ 184+M0, T6 \
	ADCQ $0, T0 \
	ADCQ $0, T1 \
	ADCQ $0, T2 \
	ADCQ $0, T3 \
	ADCQ $0, T4 \
	ADCQ $0, T5 \
	ADCQ $0, T6 \
	MOVQ T0, 136+M0 \
	MOVQ T1, 144+M0 \
	MOVQ T2, 152+M0 \
	MOVQ T3, 160+M0 \
	MOVQ T4, 168+M0 \
	MOVQ T5, 176+M0 \
	MOVQ T6, 184+M0
|
|
|
|
|
|
|
|
|
|
|
// fp751MontgomeryReduceCommonPart2 adds an eleven-word value into the
// multi-word number at M0, starting at byte offset 72 of M0.
//
// The addend is taken, from least to most significant word, from:
//     48+C, 56+C, 64+C (memory), then T8, T0, T1, T2, T3, T4, T5, T6.
// The three lowest sums are written back to 72+M0 through 88+M0. The sum for
// the word at 96+M0 is not written back to M0; it is stored to 0+C as the
// first word of the final result. The remaining sums go to 104+M0 through
// 152+M0, T7 (zeroed first) folds the carry into 160+M0, and the carry is
// then propagated through 168+M0 to 184+M0.
//
// Clobbers: AX, DX, T0-T9 and the flags. M0 and 0+C are modified.
#define fp751MontgomeryReduceCommonPart2(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
	XORQ T7, T7 \
	MOVQ 48+C, AX \
	MOVQ 56+C, DX \
	MOVQ 64+C, T9 \
	ADDQ 72+M0, AX \
	ADCQ 80+M0, DX \
	ADCQ 88+M0, T9 \
	MOVQ AX, 72+M0 \
	MOVQ DX, 80+M0 \
	MOVQ T9, 88+M0 \
	ADCQ 96+M0, T8 \
	ADCQ 104+M0, T0 \
	ADCQ 112+M0, T1 \
	ADCQ 120+M0, T2 \
	ADCQ 128+M0, T3 \
	ADCQ 136+M0, T4 \
	ADCQ 144+M0, T5 \
	ADCQ 152+M0, T6 \
	ADCQ 160+M0, T7 \
	MOVQ T8, 0+C \ // Final result c0
	MOVQ T0, 104+M0 \
	MOVQ T1, 112+M0 \
	MOVQ T2, 120+M0 \
	MOVQ T3, 128+M0 \
	MOVQ T4, 136+M0 \
	MOVQ T5, 144+M0 \
	MOVQ T6, 152+M0 \
	MOVQ T7, 160+M0 \
	\ // MOVQ leaves CF intact, so the carry from 160+M0 continues below.
	MOVQ 168+M0, T4 \
	MOVQ 176+M0, T5 \
	MOVQ 184+M0, T6 \
	ADCQ $0, T4 \
	ADCQ $0, T5 \
	ADCQ $0, T6 \
	MOVQ T4, 168+M0 \
	MOVQ T5, 176+M0 \
	MOVQ T6, 184+M0
|
|
|
|
|
|
|
|
|
|
|
// fp751MontgomeryReduceCommonPart3 adds an eleven-word value to the words of
// M0 at byte offsets 104 through 184 and stores the eleven sums to 8+C
// through 88+C. M0 itself is only read here.
//
// The addend is taken, from least to most significant word, from:
//     48+C, 56+C, 64+C (memory), then T8, T0, T1, T2, T3, T4, T5, T6.
// The three memory words are loaded before anything is stored to C, so the
// later stores to 48+C, 56+C and 64+C do not disturb the inputs. Any carry
// out of the last addition is left in CF and not stored. T7 is accepted for
// signature parity with the other parts but is not used.
//
// Clobbers: AX, DX, T0-T6, T8, T9 and the flags.
#define fp751MontgomeryReduceCommonPart3(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
	MOVQ 48+C, AX \ // Final result c1:c11
	MOVQ 56+C, DX \
	MOVQ 64+C, T9 \
	ADDQ 104+M0, AX \
	ADCQ 112+M0, DX \
	ADCQ 120+M0, T9 \
	MOVQ AX, 8+C \
	MOVQ DX, 16+C \
	MOVQ T9, 24+C \
	ADCQ 128+M0, T8 \
	ADCQ 136+M0, T0 \
	ADCQ 144+M0, T1 \
	ADCQ 152+M0, T2 \
	ADCQ 160+M0, T3 \
	ADCQ 168+M0, T4 \
	ADCQ 176+M0, T5 \
	ADCQ 184+M0, T6 \
	MOVQ T8, 32+C \
	MOVQ T0, 40+C \
	MOVQ T1, 48+C \
	MOVQ T2, 56+C \
	MOVQ T3, 64+C \
	MOVQ T4, 72+C \
	MOVQ T5, 80+C \
	MOVQ T6, 88+C
|
|
|
|
|
|
|
|
|
|
|
// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 and ADX instruction set extensions are available.
//
// Arguments (16 bytes, no local frame):
//   z+0(FP): pointer to the output; words at byte offsets 0 through 88 are
//            written, and offsets 48 through 64 double as scratch for the
//            partial products along the way.
//   x+8(FP): pointer to the input; words at byte offsets 0 through 184 are
//            accessed. The input is modified in place by the first two
//            accumulation steps.
//
// The reduction runs in three passes. Each pass multiplies four words of x by
// ·p751p1 and adds the product back into x five words higher up; the last
// pass writes the sums to z instead.
TEXT ·fp751MontgomeryReduceBMI2ADX(SB), $0-16
	MOVQ z+0(FP), REG_P2
	MOVQ x+8(FP), REG_P1

	// a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
	mul256x448bmi2adx(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
	// The register order above is permuted so that the product's upper words
	// land in BP, R8..R14 in order, matching the argument order expected here.
	fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

	// a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
	mul256x448bmi2adx(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
	fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

	// a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
	mul256x448bmi2adx(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
	fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

	RET
|
|
|
|
|
|
|
|
|
|
|
// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 instruction set extension is available.
//
// Arguments (16 bytes, no local frame):
//   z+0(FP): pointer to the output; words at byte offsets 0 through 88 are
//            written. Offsets 48 through 64 hold the low words of each
//            partial product, and offsets 80 and 88 are used as scratch by
//            the multiply macro (its 32+C and 40+C with C = 48(REG_P2)).
//   x+8(FP): pointer to the input; words at byte offsets 0 through 184 are
//            accessed. The input is modified in place by the first two
//            accumulation steps.
//
// The reduction runs in three passes. Each pass multiplies four words of x by
// ·p751p1 and adds the product back into x five words higher up; the last
// pass writes the sums to z instead.
TEXT ·fp751MontgomeryReduceBMI2(SB), $0-16
	MOVQ z+0(FP), REG_P2
	MOVQ x+8(FP), REG_P1

	// a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
	mul256x448bmi2(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
	// The register order above is permuted so that the product's upper words
	// land in BP, R8..R14 in order, matching the argument order expected here.
	fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

	// a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
	mul256x448bmi2(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
	fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

	// a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
	mul256x448bmi2(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
	fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)

	RET
|
|
|
|
|
|
|
|
|
|
|
// This implements the straightforward Montgomery reduction algorithm without |
|
|
|
|
|
// using specific instruction set extensions. |
|
|
|
|
|
TEXT ·fp751MontgomeryReduceFallback(SB), $0-16 |
|
|
|
|
|
|
|
|
MOVQ z+0(FP), REG_P2 |
|
|
MOVQ z+0(FP), REG_P2 |
|
|
MOVQ x+8(FP), REG_P1 |
|
|
MOVQ x+8(FP), REG_P1 |
|
|