From beb8962885cd14880b1aa65645c2819376227a1a Mon Sep 17 00:00:00 2001 From: Ko- Date: Tue, 21 Aug 2018 14:42:42 +0100 Subject: [PATCH 1/7] p751: speed up montgomery reduction with mulx/adox In https://eprint.iacr.org/2017/1015.pdf a technique was described to improve the performance of Montgomery reduction for Montgomery-friendly moduli. This adds an implementation using the mulx, adox and adcx instructions, available in the BMI2 (since Haswell) and ADX (since Broadwell) instruction set extensions. --- p751toolbox/field_amd64.s | 381 +++++++++++++++++++++++++++++++++++++- p751toolbox/field_decl.go | 7 + 2 files changed, 387 insertions(+), 1 deletion(-) diff --git a/p751toolbox/field_amd64.s b/p751toolbox/field_amd64.s index 4596d8f..48d624e 100644 --- a/p751toolbox/field_amd64.s +++ b/p751toolbox/field_amd64.s @@ -30,6 +30,15 @@ #define P751X2_10 $0x1C25213F2F75B8CD #define P751X2_11 $0x0000DFCBAA83EE38 +DATA P751P1_NZ<>+0x00(SB)/8, $0xEEB0000000000000 +DATA P751P1_NZ<>+0x08(SB)/8, $0xE3EC968549F878A8 +DATA P751P1_NZ<>+0x10(SB)/8, $0xDA959B1A13F7CC76 +DATA P751P1_NZ<>+0x18(SB)/8, $0x084E9867D6EBE876 +DATA P751P1_NZ<>+0x20(SB)/8, $0x8562B5045CB25748 +DATA P751P1_NZ<>+0x28(SB)/8, $0x0E12909F97BADC66 +DATA P751P1_NZ<>+0x30(SB)/8, $0x00006FE5D541F71C +GLOBL P751P1_NZ<>(SB), (NOPTR + RODATA), $0x38 + // The MSR code uses these registers for parameter passing. Keep using // them to avoid significant code changes. 
This means that when the Go // assembler does something strange, we can diff the machine code @@ -1417,11 +1426,381 @@ TEXT ·fp751Mul(SB), $96-24 RET -TEXT ·fp751MontgomeryReduce(SB), $0-16 +#define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ + MOVQ 0+M0, DX \ + MULXQ M1+0(SB), T1, T0 \ + MULXQ M1+8(SB), T3, T2 \ + MOVQ T1, 0+C \ // C0_final + XORQ AX, AX \ + MULXQ M1+16(SB), T5, T4 \ + ADOXQ T3, T0 \ + ADOXQ T5, T2 \ + MULXQ M1+24(SB), T3, T1 \ + ADOXQ T3, T4 \ + MULXQ M1+32(SB), T6, T5 \ + ADOXQ T6, T1 \ + MULXQ M1+40(SB), T7, T3 \ + ADOXQ T7, T5 \ + MULXQ M1+48(SB), T8, T6 \ + ADOXQ T8, T3 \ + ADOXQ AX, T6 \ + \ + MOVQ 8+M0, DX \ + MULXQ M1+0(SB), T7, T8 \ + XORQ AX, AX \ + ADCXQ T7, T0 \ + MOVQ T0, 8+C \ // C1_final + ADCXQ T8, T2 \ + MULXQ M1+8(SB), T8, T7 \ + ADOXQ T8, T2 \ + ADCXQ T7, T4 \ + MULXQ M1+16(SB), T8, T0 \ + ADOXQ T8, T4 \ + ADCXQ T1, T0 \ + MULXQ M1+24(SB), T7, T1 \ + ADCXQ T5, T1 \ + MULXQ M1+32(SB), T8, T5 \ + ADCXQ T5, T3 \ + MULXQ M1+40(SB), T9, T5 \ + ADCXQ T5, T6 \ + MULXQ M1+48(SB), DX, T5 \ + ADCXQ AX, T5 \ + \ + ADOXQ T7, T0 \ + ADOXQ T8, T1 \ + ADOXQ T9, T3 \ + ADOXQ DX, T6 \ + ADOXQ AX, T5 \ + \ + MOVQ 16+M0, DX \ + MULXQ M1+0(SB), T7, T8 \ + XORQ AX, AX \ + ADCXQ T7, T2 \ + MOVQ T2, 16+C \ // C2_final + ADCXQ T8, T4 \ + MULXQ M1+8(SB), T7, T8 \ + ADOXQ T7, T4 \ + ADCXQ T8, T0 \ + MULXQ M1+16(SB), T8, T2 \ + ADOXQ T8, T0 \ + ADCXQ T2, T1 \ + MULXQ M1+24(SB), T7, T2 \ + ADCXQ T2, T3 \ + MULXQ M1+32(SB), T8, T2 \ + ADCXQ T2, T6 \ + MULXQ M1+40(SB), T9, T2 \ + ADCXQ T2, T5 \ + MULXQ M1+48(SB), DX, T2 \ + ADCXQ AX, T2 \ + \ + ADOXQ T7, T1 \ + ADOXQ T8, T3 \ + ADOXQ T9, T6 \ + ADOXQ DX, T5 \ + ADOXQ AX, T2 \ + \ + MOVQ 24+M0, DX \ + MULXQ M1+0(SB), T7, T8 \ + XORQ AX, AX \ + ADCXQ T4, T7 \ + ADCXQ T8, T0 \ + MULXQ M1+8(SB), T10, T8 \ + ADOXQ T10, T0 \ + ADCXQ T8, T1 \ + MULXQ M1+16(SB), T8, T4 \ + ADOXQ T8, T1 \ + ADCXQ T4, T3 \ + MULXQ M1+24(SB), T10, T4 \ + ADCXQ T4, T6 \ + MULXQ M1+32(SB), T8, T4 \ + ADCXQ 
T4, T5 \ + MULXQ M1+40(SB), T9, T4 \ + ADCXQ T4, T2 \ + MULXQ M1+48(SB), DX, T4 \ + ADCXQ AX, T4 \ + \ + ADOXQ T10, T3 \ + ADOXQ T8, T6 \ + ADOXQ T9, T5 \ + ADOXQ DX, T2 \ + ADOXQ AX, T4 + +#define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ + MOVQ 0+M0, DX \ + MULXQ M1+0(SB), T1, T0 \ + MULXQ M1+8(SB), T3, T2 \ + MOVQ T1, 0+C \ // C0_final + XORQ AX, AX \ + MULXQ M1+16(SB), T5, T4 \ + ADDQ T3, T0 \ + ADCQ T5, T2 \ + MULXQ M1+24(SB), T3, T1 \ + ADCQ T3, T4 \ + MULXQ M1+32(SB), T6, T5 \ + ADCQ T6, T1 \ + MULXQ M1+40(SB), T7, T3 \ + ADCQ T7, T5 \ + MULXQ M1+48(SB), T8, T6 \ + ADCQ T8, T3 \ + ADCQ AX, T6 \ + \ + MOVQ 8+M0, DX \ + MULXQ M1+0(SB), T7, T8 \ + ADDQ T7, T0 \ + MOVQ T0, 8+C \ // C1_final + ADCQ T8, T2 \ + MULXQ M1+8(SB), T8, T7 \ + MOVQ T8, 32+C \ + ADCQ T7, T4 \ + MULXQ M1+16(SB), T8, T0 \ + MOVQ T8, 40+C \ + ADCQ T1, T0 \ + MULXQ M1+24(SB), T7, T1 \ + ADCQ T5, T1 \ + MULXQ M1+32(SB), T8, T5 \ + ADCQ T5, T3 \ + MULXQ M1+40(SB), T9, T5 \ + ADCQ T5, T6 \ + MULXQ M1+48(SB), DX, T5 \ + ADCQ AX, T5 \ + \ + XORQ AX, AX \ + ADDQ 32+C, T2 \ + ADCQ 40+C, T4 \ + ADCQ T7, T0 \ + ADCQ T8, T1 \ + ADCQ T9, T3 \ + ADCQ DX, T6 \ + ADCQ AX, T5 \ + \ + MOVQ 16+M0, DX \ + MULXQ M1+0(SB), T7, T8 \ + ADDQ T7, T2 \ + MOVQ T2, 16+C \ // C2_final + ADCQ T8, T4 \ + MULXQ M1+8(SB), T7, T8 \ + MOVQ T7, 32+C \ + ADCQ T8, T0 \ + MULXQ M1+16(SB), T8, T2 \ + MOVQ T8, 40+C \ + ADCQ T2, T1 \ + MULXQ M1+24(SB), T7, T2 \ + ADCQ T2, T3 \ + MULXQ M1+32(SB), T8, T2 \ + ADCQ T2, T6 \ + MULXQ M1+40(SB), T9, T2 \ + ADCQ T2, T5 \ + MULXQ M1+48(SB), DX, T2 \ + ADCQ AX, T2 \ + \ + XORQ AX, AX \ + ADDQ 32+C, T4 \ + ADCQ 40+C, T0 \ + ADCQ T7, T1 \ + ADCQ T8, T3 \ + ADCQ T9, T6 \ + ADCQ DX, T5 \ + ADCQ AX, T2 \ + \ + MOVQ 24+M0, DX \ + MULXQ M1+0(SB), T7, T8 \ + ADDQ T4, T7 \ + ADCQ T8, T0 \ + MULXQ M1+8(SB), T10, T8 \ + MOVQ T10, 32+C \ + ADCQ T8, T1 \ + MULXQ M1+16(SB), T8, T4 \ + MOVQ T8, 40+C \ + ADCQ T4, T3 \ + MULXQ M1+24(SB), T10, T4 \ + ADCQ T4, T6 \ + MULXQ 
M1+32(SB), T8, T4 \ + ADCQ T4, T5 \ + MULXQ M1+40(SB), T9, T4 \ + ADCQ T4, T2 \ + MULXQ M1+48(SB), DX, T4 \ + ADCQ AX, T4 \ + \ + XORQ AX, AX \ + ADDQ 32+C, T0 \ + ADCQ 40+C, T1 \ + ADCQ T10, T3 \ + ADCQ T8, T6 \ + ADCQ T9, T5 \ + ADCQ DX, T2 \ + ADCQ AX, T4 + +TEXT ·fp751MontgomeryReduce(SB), $48-16 MOVQ z+0(FP), REG_P2 MOVQ x+8(FP), REG_P1 + // If the MULX instruction is available, a technique from + // https://eprint.iacr.org/2017/1015 can be used for a faster Montgomery + // reduction. + CMPB ·hasBMI2(SB), $0 + JE noBMI2 + + MOVQ BX, 0(SP) + MOVQ BP, 8(SP) + MOVQ R12, 16(SP) + MOVQ R13, 24(SP) + MOVQ R14, 32(SP) + MOVQ R15, 40(SP) + + // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 + CMPB ·hasADX(SB), $0 + JE noADX1 + mul256x448bmi2adx(0(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + JMP continue1 +noADX1: + mul256x448bmi2(0(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + +continue1: + XORQ R15, R15 + MOVQ 48(REG_P2), AX + MOVQ 56(REG_P2), DX + MOVQ 64(REG_P2), BX + ADDQ 40(REG_P1), AX + ADCQ 48(REG_P1), DX + ADCQ 56(REG_P1), BX + MOVQ AX, 40(REG_P1) + MOVQ DX, 48(REG_P1) + MOVQ BX, 56(REG_P1) + ADCQ 64(REG_P1), BP + ADCQ 72(REG_P1), R8 + ADCQ 80(REG_P1), R9 + ADCQ 88(REG_P1), R10 + ADCQ 96(REG_P1), R11 + ADCQ 104(REG_P1), R12 + ADCQ 112(REG_P1), R13 + ADCQ 120(REG_P1), R14 + ADCQ 128(REG_P1), R15 + MOVQ BP, 64(REG_P1) + MOVQ R8, 72(REG_P1) + MOVQ R9, 80(REG_P1) + MOVQ R10, 88(REG_P1) + MOVQ R11, 96(REG_P1) + MOVQ R12, 104(REG_P1) + MOVQ R13, 112(REG_P1) + MOVQ R14, 120(REG_P1) + MOVQ R15, 128(REG_P1) + MOVQ 136(REG_P1), R8 + MOVQ 144(REG_P1), R9 + MOVQ 152(REG_P1), R10 + MOVQ 160(REG_P1), R11 + MOVQ 168(REG_P1), R12 + MOVQ 176(REG_P1), R13 + MOVQ 184(REG_P1), R14 + ADCQ $0, R8 + ADCQ $0, R9 + ADCQ $0, R10 + ADCQ $0, R11 + ADCQ $0, R12 + ADCQ $0, R13 + ADCQ $0, R14 + MOVQ R8, 136(REG_P1) + MOVQ R9, 144(REG_P1) + MOVQ R10, 152(REG_P1) + MOVQ R11, 
160(REG_P1) + MOVQ R12, 168(REG_P1) + MOVQ R13, 176(REG_P1) + MOVQ R14, 184(REG_P1) + + // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 + CMPB ·hasADX(SB), $0 + JE noADX2 + mul256x448bmi2adx(32(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + JMP continue2 +noADX2: + mul256x448bmi2(32(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + +continue2: + XORQ R15, R15 + MOVQ 48(REG_P2), AX + MOVQ 56(REG_P2), DX + MOVQ 64(REG_P2), BX + ADDQ 72(REG_P1), AX + ADCQ 80(REG_P1), DX + ADCQ 88(REG_P1), BX + MOVQ AX, 72(REG_P1) + MOVQ DX, 80(REG_P1) + MOVQ BX, 88(REG_P1) + ADCQ 96(REG_P1), BP + ADCQ 104(REG_P1), R8 + ADCQ 112(REG_P1), R9 + ADCQ 120(REG_P1), R10 + ADCQ 128(REG_P1), R11 + ADCQ 136(REG_P1), R12 + ADCQ 144(REG_P1), R13 + ADCQ 152(REG_P1), R14 + ADCQ 160(REG_P1), R15 + MOVQ BP, (REG_P2) // Final result c0 + MOVQ R8, 104(REG_P1) + MOVQ R9, 112(REG_P1) + MOVQ R10, 120(REG_P1) + MOVQ R11, 128(REG_P1) + MOVQ R12, 136(REG_P1) + MOVQ R13, 144(REG_P1) + MOVQ R14, 152(REG_P1) + MOVQ R15, 160(REG_P1) + MOVQ 168(REG_P1), R12 + MOVQ 176(REG_P1), R13 + MOVQ 184(REG_P1), R14 + ADCQ $0, R12 + ADCQ $0, R13 + ADCQ $0, R14 + MOVQ R12, 168(REG_P1) + MOVQ R13, 176(REG_P1) + MOVQ R14, 184(REG_P1) + + + // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 + CMPB ·hasADX(SB), $0 + JE noADX3 + mul256x448bmi2adx(64(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + JMP continue3 +noADX3: + mul256x448bmi2(64(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + +continue3: + // Final result c1:c11 + MOVQ 48(REG_P2), AX + MOVQ 56(REG_P2), DX + MOVQ 64(REG_P2), BX + ADDQ 104(REG_P1), AX + ADCQ 112(REG_P1), DX + ADCQ 120(REG_P1), BX + MOVQ AX, 8(REG_P2) + MOVQ DX, 16(REG_P2) + MOVQ BX, 24(REG_P2) + ADCQ 128(REG_P1), BP + ADCQ 136(REG_P1), R8 + ADCQ 144(REG_P1), R9 + ADCQ 
152(REG_P1), R10 + ADCQ 160(REG_P1), R11 + ADCQ 168(REG_P1), R12 + ADCQ 176(REG_P1), R13 + ADCQ 184(REG_P1), R14 + MOVQ BP, 32(REG_P2) + MOVQ R8, 40(REG_P2) + MOVQ R9, 48(REG_P2) + MOVQ R10, 56(REG_P2) + MOVQ R11, 64(REG_P2) + MOVQ R12, 72(REG_P2) + MOVQ R13, 80(REG_P2) + MOVQ R14, 88(REG_P2) + + MOVQ 0(SP), BX + MOVQ 8(SP), BP + MOVQ 16(SP), R12 + MOVQ 24(SP), R13 + MOVQ 32(SP), R14 + MOVQ 40(SP), R15 + + RET + +noBMI2: MOVQ (REG_P1), R11 MOVQ P751P1_5, AX MULQ R11 diff --git a/p751toolbox/field_decl.go b/p751toolbox/field_decl.go index 37d462a..d33341a 100644 --- a/p751toolbox/field_decl.go +++ b/p751toolbox/field_decl.go @@ -2,6 +2,13 @@ package p751toolbox +import ( + "golang.org/x/sys/cpu" +) + +var hasADX = cpu.X86.HasADX +var hasBMI2 = cpu.X86.HasBMI2 + // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. // If choice is neither 0 nor 1 then behaviour is undefined. // This function executes in constant time. From 036e180db1b30f939416ac39fd1747d347a6f12f Mon Sep 17 00:00:00 2001 From: Ko- Date: Tue, 21 Aug 2018 15:35:21 +0100 Subject: [PATCH 2/7] p751: fix asm, in Go registers are caller-saved --- p751toolbox/field_amd64.s | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/p751toolbox/field_amd64.s b/p751toolbox/field_amd64.s index 48d624e..7bb7cd1 100644 --- a/p751toolbox/field_amd64.s +++ b/p751toolbox/field_amd64.s @@ -1630,7 +1630,7 @@ TEXT ·fp751Mul(SB), $96-24 ADCQ DX, T2 \ ADCQ AX, T4 -TEXT ·fp751MontgomeryReduce(SB), $48-16 +TEXT ·fp751MontgomeryReduce(SB), $0-16 MOVQ z+0(FP), REG_P2 MOVQ x+8(FP), REG_P1 @@ -1641,13 +1641,6 @@ TEXT ·fp751MontgomeryReduce(SB), $48-16 CMPB ·hasBMI2(SB), $0 JE noBMI2 - MOVQ BX, 0(SP) - MOVQ BP, 8(SP) - MOVQ R12, 16(SP) - MOVQ R13, 24(SP) - MOVQ R14, 32(SP) - MOVQ R15, 40(SP) - // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 CMPB ·hasADX(SB), $0 JE noADX1 @@ -1791,13 +1784,6 @@ continue3: MOVQ R13, 80(REG_P2) MOVQ R14, 88(REG_P2) - 
MOVQ 0(SP), BX - MOVQ 8(SP), BP - MOVQ 16(SP), R12 - MOVQ 24(SP), R13 - MOVQ 32(SP), R14 - MOVQ 40(SP), R15 - RET noBMI2: From 81fe4f21d704de34bddca3b529a197edea035dff Mon Sep 17 00:00:00 2001 From: Ko- Date: Wed, 22 Aug 2018 11:46:42 +0100 Subject: [PATCH 3/7] p751: refactor montgomery reduction Use init() to set a function pointer to the fastest implementation of fp751MontgomeryReduce based on CPU capabilities. --- p751toolbox/field_amd64.s | 285 +++++++++++++++++++------------------- p751toolbox/field_decl.go | 28 +++- 2 files changed, 169 insertions(+), 144 deletions(-) diff --git a/p751toolbox/field_amd64.s b/p751toolbox/field_amd64.s index 7bb7cd1..133fdf2 100644 --- a/p751toolbox/field_amd64.s +++ b/p751toolbox/field_amd64.s @@ -1630,163 +1630,170 @@ TEXT ·fp751Mul(SB), $96-24 ADCQ DX, T2 \ ADCQ AX, T4 -TEXT ·fp751MontgomeryReduce(SB), $0-16 - +#define fp751MontgomeryReduceCommonPart1(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\ + XORQ T7, T7 \ + MOVQ 48+C, AX \ + MOVQ 56+C, DX \ + MOVQ 64+C, T9 \ + ADDQ 40+M0, AX \ + ADCQ 48+M0, DX \ + ADCQ 56+M0, T9 \ + MOVQ AX, 40+M0 \ + MOVQ DX, 48+M0 \ + MOVQ T9, 56+M0 \ + ADCQ 64+M0, T8 \ + ADCQ 72+M0, T0 \ + ADCQ 80+M0, T1 \ + ADCQ 88+M0, T2 \ + ADCQ 96+M0, T3 \ + ADCQ 104+M0, T4 \ + ADCQ 112+M0, T5 \ + ADCQ 120+M0, T6 \ + ADCQ 128+M0, T7 \ + MOVQ T8, 64+M0 \ + MOVQ T0, 72+M0 \ + MOVQ T1, 80+M0 \ + MOVQ T2, 88+M0 \ + MOVQ T3, 96+M0 \ + MOVQ T4, 104+M0 \ + MOVQ T5, 112+M0 \ + MOVQ T6, 120+M0 \ + MOVQ T7, 128+M0 \ + MOVQ 136+M0, T0 \ + MOVQ 144+M0, T1 \ + MOVQ 152+M0, T2 \ + MOVQ 160+M0, T3 \ + MOVQ 168+M0, T4 \ + MOVQ 176+M0, T5 \ + MOVQ 184+M0, T6 \ + ADCQ $0, T0 \ + ADCQ $0, T1 \ + ADCQ $0, T2 \ + ADCQ $0, T3 \ + ADCQ $0, T4 \ + ADCQ $0, T5 \ + ADCQ $0, T6 \ + MOVQ T0, 136+M0 \ + MOVQ T1, 144+M0 \ + MOVQ T2, 152+M0 \ + MOVQ T3, 160+M0 \ + MOVQ T4, 168+M0 \ + MOVQ T5, 176+M0 \ + MOVQ T6, 184+M0 + +#define fp751MontgomeryReduceCommonPart2(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\ + XORQ T7, T7 \ + MOVQ 48+C, 
AX \ + MOVQ 56+C, DX \ + MOVQ 64+C, T9 \ + ADDQ 72+M0, AX \ + ADCQ 80+M0, DX \ + ADCQ 88+M0, T9 \ + MOVQ AX, 72+M0 \ + MOVQ DX, 80+M0 \ + MOVQ T9, 88+M0 \ + ADCQ 96+M0, T8 \ + ADCQ 104+M0, T0 \ + ADCQ 112+M0, T1 \ + ADCQ 120+M0, T2 \ + ADCQ 128+M0, T3 \ + ADCQ 136+M0, T4 \ + ADCQ 144+M0, T5 \ + ADCQ 152+M0, T6 \ + ADCQ 160+M0, T7 \ + MOVQ T8, 0+C \ // Final result c0 + MOVQ T0, 104+M0 \ + MOVQ T1, 112+M0 \ + MOVQ T2, 120+M0 \ + MOVQ T3, 128+M0 \ + MOVQ T4, 136+M0 \ + MOVQ T5, 144+M0 \ + MOVQ T6, 152+M0 \ + MOVQ T7, 160+M0 \ + MOVQ 168+M0, T4 \ + MOVQ 176+M0, T5 \ + MOVQ 184+M0, T6 \ + ADCQ $0, T4 \ + ADCQ $0, T5 \ + ADCQ $0, T6 \ + MOVQ T4, 168+M0 \ + MOVQ T5, 176+M0 \ + MOVQ T6, 184+M0 + +#define fp751MontgomeryReduceCommonPart3(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\ + MOVQ 48+C, AX \ // Final result c1:c11 + MOVQ 56+C, DX \ + MOVQ 64+C, T9 \ + ADDQ 104+M0, AX \ + ADCQ 112+M0, DX \ + ADCQ 120+M0, T9 \ + MOVQ AX, 8+C \ + MOVQ DX, 16+C \ + MOVQ T9, 24+C \ + ADCQ 128+M0, T8 \ + ADCQ 136+M0, T0 \ + ADCQ 144+M0, T1 \ + ADCQ 152+M0, T2 \ + ADCQ 160+M0, T3 \ + ADCQ 168+M0, T4 \ + ADCQ 176+M0, T5 \ + ADCQ 184+M0, T6 \ + MOVQ T8, 32+C \ + MOVQ T0, 40+C \ + MOVQ T1, 48+C \ + MOVQ T2, 56+C \ + MOVQ T3, 64+C \ + MOVQ T4, 72+C \ + MOVQ T5, 80+C \ + MOVQ T6, 88+C + +TEXT ·fp751MontgomeryReduceBMI2ADX(SB), $0-16 MOVQ z+0(FP), REG_P2 MOVQ x+8(FP), REG_P1 - // If the MULX instruction is available, a technique from - // https://eprint.iacr.org/2017/1015 can be used for a faster Montgomery - // reduction. 
- CMPB ·hasBMI2(SB), $0 - JE noBMI2 - // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - CMPB ·hasADX(SB), $0 - JE noADX1 mul256x448bmi2adx(0(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) - JMP continue1 -noADX1: - mul256x448bmi2(0(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) -continue1: - XORQ R15, R15 - MOVQ 48(REG_P2), AX - MOVQ 56(REG_P2), DX - MOVQ 64(REG_P2), BX - ADDQ 40(REG_P1), AX - ADCQ 48(REG_P1), DX - ADCQ 56(REG_P1), BX - MOVQ AX, 40(REG_P1) - MOVQ DX, 48(REG_P1) - MOVQ BX, 56(REG_P1) - ADCQ 64(REG_P1), BP - ADCQ 72(REG_P1), R8 - ADCQ 80(REG_P1), R9 - ADCQ 88(REG_P1), R10 - ADCQ 96(REG_P1), R11 - ADCQ 104(REG_P1), R12 - ADCQ 112(REG_P1), R13 - ADCQ 120(REG_P1), R14 - ADCQ 128(REG_P1), R15 - MOVQ BP, 64(REG_P1) - MOVQ R8, 72(REG_P1) - MOVQ R9, 80(REG_P1) - MOVQ R10, 88(REG_P1) - MOVQ R11, 96(REG_P1) - MOVQ R12, 104(REG_P1) - MOVQ R13, 112(REG_P1) - MOVQ R14, 120(REG_P1) - MOVQ R15, 128(REG_P1) - MOVQ 136(REG_P1), R8 - MOVQ 144(REG_P1), R9 - MOVQ 152(REG_P1), R10 - MOVQ 160(REG_P1), R11 - MOVQ 168(REG_P1), R12 - MOVQ 176(REG_P1), R13 - MOVQ 184(REG_P1), R14 - ADCQ $0, R8 - ADCQ $0, R9 - ADCQ $0, R10 - ADCQ $0, R11 - ADCQ $0, R12 - ADCQ $0, R13 - ADCQ $0, R14 - MOVQ R8, 136(REG_P1) - MOVQ R9, 144(REG_P1) - MOVQ R10, 152(REG_P1) - MOVQ R11, 160(REG_P1) - MOVQ R12, 168(REG_P1) - MOVQ R13, 176(REG_P1) - MOVQ R14, 184(REG_P1) + fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - CMPB ·hasADX(SB), $0 - JE noADX2 mul256x448bmi2adx(32(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) - JMP continue2 -noADX2: - mul256x448bmi2(32(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) - -continue2: - XORQ R15, R15 - MOVQ 48(REG_P2), AX - MOVQ 
56(REG_P2), DX - MOVQ 64(REG_P2), BX - ADDQ 72(REG_P1), AX - ADCQ 80(REG_P1), DX - ADCQ 88(REG_P1), BX - MOVQ AX, 72(REG_P1) - MOVQ DX, 80(REG_P1) - MOVQ BX, 88(REG_P1) - ADCQ 96(REG_P1), BP - ADCQ 104(REG_P1), R8 - ADCQ 112(REG_P1), R9 - ADCQ 120(REG_P1), R10 - ADCQ 128(REG_P1), R11 - ADCQ 136(REG_P1), R12 - ADCQ 144(REG_P1), R13 - ADCQ 152(REG_P1), R14 - ADCQ 160(REG_P1), R15 - MOVQ BP, (REG_P2) // Final result c0 - MOVQ R8, 104(REG_P1) - MOVQ R9, 112(REG_P1) - MOVQ R10, 120(REG_P1) - MOVQ R11, 128(REG_P1) - MOVQ R12, 136(REG_P1) - MOVQ R13, 144(REG_P1) - MOVQ R14, 152(REG_P1) - MOVQ R15, 160(REG_P1) - MOVQ 168(REG_P1), R12 - MOVQ 176(REG_P1), R13 - MOVQ 184(REG_P1), R14 - ADCQ $0, R12 - ADCQ $0, R13 - ADCQ $0, R14 - MOVQ R12, 168(REG_P1) - MOVQ R13, 176(REG_P1) - MOVQ R14, 184(REG_P1) + fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - CMPB ·hasADX(SB), $0 - JE noADX3 mul256x448bmi2adx(64(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) - JMP continue3 -noADX3: + + fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) + + RET + +TEXT ·fp751MontgomeryReduceBMI2(SB), $0-16 + MOVQ z+0(FP), REG_P2 + MOVQ x+8(FP), REG_P1 + + // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 + mul256x448bmi2(0(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + + fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) + + // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 + mul256x448bmi2(32(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + + fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) + + // a[8-11] x p751p1_nz --> 
result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 mul256x448bmi2(64(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) -continue3: - // Final result c1:c11 - MOVQ 48(REG_P2), AX - MOVQ 56(REG_P2), DX - MOVQ 64(REG_P2), BX - ADDQ 104(REG_P1), AX - ADCQ 112(REG_P1), DX - ADCQ 120(REG_P1), BX - MOVQ AX, 8(REG_P2) - MOVQ DX, 16(REG_P2) - MOVQ BX, 24(REG_P2) - ADCQ 128(REG_P1), BP - ADCQ 136(REG_P1), R8 - ADCQ 144(REG_P1), R9 - ADCQ 152(REG_P1), R10 - ADCQ 160(REG_P1), R11 - ADCQ 168(REG_P1), R12 - ADCQ 176(REG_P1), R13 - ADCQ 184(REG_P1), R14 - MOVQ BP, 32(REG_P2) - MOVQ R8, 40(REG_P2) - MOVQ R9, 48(REG_P2) - MOVQ R10, 56(REG_P2) - MOVQ R11, 64(REG_P2) - MOVQ R12, 72(REG_P2) - MOVQ R13, 80(REG_P2) - MOVQ R14, 88(REG_P2) + fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) RET -noBMI2: +TEXT ·fp751MontgomeryReduceFallback(SB), $0-16 + + MOVQ z+0(FP), REG_P2 + MOVQ x+8(FP), REG_P1 + MOVQ (REG_P1), R11 MOVQ P751P1_5, AX MULQ R11 diff --git a/p751toolbox/field_decl.go b/p751toolbox/field_decl.go index d33341a..c3e05c1 100644 --- a/p751toolbox/field_decl.go +++ b/p751toolbox/field_decl.go @@ -3,12 +3,9 @@ package p751toolbox import ( - "golang.org/x/sys/cpu" + "golang.org/x/sys/cpu" ) -var hasADX = cpu.X86.HasADX -var hasBMI2 = cpu.X86.HasBMI2 - // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. // If choice is neither 0 nor 1 then behaviour is undefined. // This function executes in constant time. @@ -41,9 +38,30 @@ func fp751Mul(z *fp751X2, x, y *Fp751Element) // Perform Montgomery reduction: set z = x R^{-1} (mod 2*p). // Destroys the input value. 
+var fp751MontgomeryReduce func(z *Fp751Element, x *fp751X2) + +//go:noescape +func fp751MontgomeryReduceBMI2ADX(z *Fp751Element, x *fp751X2) + //go:noescape -func fp751MontgomeryReduce(z *Fp751Element, x *fp751X2) +func fp751MontgomeryReduceBMI2(z *Fp751Element, x *fp751X2) + +//go:noescape +func fp751MontgomeryReduceFallback(z *Fp751Element, x *fp751X2) // Reduce a field element in [0, 2*p) to one in [0,p). //go:noescape func fp751StrongReduce(x *Fp751Element) + +// Choose the fastest variant depending on CPU capabilities. +func init() { + if cpu.X86.HasBMI2 { + if cpu.X86.HasADX { + fp751MontgomeryReduce = fp751MontgomeryReduceBMI2ADX + } else { + fp751MontgomeryReduce = fp751MontgomeryReduceBMI2 + } + } else { + fp751MontgomeryReduce = fp751MontgomeryReduceFallback + } +} From bfaceaa1c17f60cd59a9cd32480de99fd9b54c51 Mon Sep 17 00:00:00 2001 From: Ko- Date: Thu, 23 Aug 2018 09:53:12 +0100 Subject: [PATCH 4/7] p751: remove duplicate storage of p751+1 --- p751toolbox/field_amd64.s | 133 ++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 71 deletions(-) diff --git a/p751toolbox/field_amd64.s b/p751toolbox/field_amd64.s index 133fdf2..75d142f 100644 --- a/p751toolbox/field_amd64.s +++ b/p751toolbox/field_amd64.s @@ -30,15 +30,6 @@ #define P751X2_10 $0x1C25213F2F75B8CD #define P751X2_11 $0x0000DFCBAA83EE38 -DATA P751P1_NZ<>+0x00(SB)/8, $0xEEB0000000000000 -DATA P751P1_NZ<>+0x08(SB)/8, $0xE3EC968549F878A8 -DATA P751P1_NZ<>+0x10(SB)/8, $0xDA959B1A13F7CC76 -DATA P751P1_NZ<>+0x18(SB)/8, $0x084E9867D6EBE876 -DATA P751P1_NZ<>+0x20(SB)/8, $0x8562B5045CB25748 -DATA P751P1_NZ<>+0x28(SB)/8, $0x0E12909F97BADC66 -DATA P751P1_NZ<>+0x30(SB)/8, $0x00006FE5D541F71C -GLOBL P751P1_NZ<>(SB), (NOPTR + RODATA), $0x38 - // The MSR code uses these registers for parameter passing. Keep using // them to avoid significant code changes. 
This means that when the Go // assembler does something strange, we can diff the machine code @@ -1428,42 +1419,42 @@ TEXT ·fp751Mul(SB), $96-24 #define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ MOVQ 0+M0, DX \ - MULXQ M1+0(SB), T1, T0 \ - MULXQ M1+8(SB), T3, T2 \ + MULXQ M1+40(SB), T1, T0 \ + MULXQ M1+48(SB), T3, T2 \ MOVQ T1, 0+C \ // C0_final XORQ AX, AX \ - MULXQ M1+16(SB), T5, T4 \ + MULXQ M1+56(SB), T5, T4 \ ADOXQ T3, T0 \ ADOXQ T5, T2 \ - MULXQ M1+24(SB), T3, T1 \ + MULXQ M1+64(SB), T3, T1 \ ADOXQ T3, T4 \ - MULXQ M1+32(SB), T6, T5 \ + MULXQ M1+72(SB), T6, T5 \ ADOXQ T6, T1 \ - MULXQ M1+40(SB), T7, T3 \ + MULXQ M1+80(SB), T7, T3 \ ADOXQ T7, T5 \ - MULXQ M1+48(SB), T8, T6 \ + MULXQ M1+88(SB), T8, T6 \ ADOXQ T8, T3 \ ADOXQ AX, T6 \ \ MOVQ 8+M0, DX \ - MULXQ M1+0(SB), T7, T8 \ + MULXQ M1+40(SB), T7, T8 \ XORQ AX, AX \ ADCXQ T7, T0 \ MOVQ T0, 8+C \ // C1_final ADCXQ T8, T2 \ - MULXQ M1+8(SB), T8, T7 \ + MULXQ M1+48(SB), T8, T7 \ ADOXQ T8, T2 \ ADCXQ T7, T4 \ - MULXQ M1+16(SB), T8, T0 \ + MULXQ M1+56(SB), T8, T0 \ ADOXQ T8, T4 \ ADCXQ T1, T0 \ - MULXQ M1+24(SB), T7, T1 \ + MULXQ M1+64(SB), T7, T1 \ ADCXQ T5, T1 \ - MULXQ M1+32(SB), T8, T5 \ + MULXQ M1+72(SB), T8, T5 \ ADCXQ T5, T3 \ - MULXQ M1+40(SB), T9, T5 \ + MULXQ M1+80(SB), T9, T5 \ ADCXQ T5, T6 \ - MULXQ M1+48(SB), DX, T5 \ + MULXQ M1+88(SB), DX, T5 \ ADCXQ AX, T5 \ \ ADOXQ T7, T0 \ @@ -1473,24 +1464,24 @@ TEXT ·fp751Mul(SB), $96-24 ADOXQ AX, T5 \ \ MOVQ 16+M0, DX \ - MULXQ M1+0(SB), T7, T8 \ + MULXQ M1+40(SB), T7, T8 \ XORQ AX, AX \ ADCXQ T7, T2 \ MOVQ T2, 16+C \ // C2_final ADCXQ T8, T4 \ - MULXQ M1+8(SB), T7, T8 \ + MULXQ M1+48(SB), T7, T8 \ ADOXQ T7, T4 \ ADCXQ T8, T0 \ - MULXQ M1+16(SB), T8, T2 \ + MULXQ M1+56(SB), T8, T2 \ ADOXQ T8, T0 \ ADCXQ T2, T1 \ - MULXQ M1+24(SB), T7, T2 \ + MULXQ M1+64(SB), T7, T2 \ ADCXQ T2, T3 \ - MULXQ M1+32(SB), T8, T2 \ + MULXQ M1+72(SB), T8, T2 \ ADCXQ T2, T6 \ - MULXQ M1+40(SB), T9, T2 \ + MULXQ M1+80(SB), T9, T2 \ ADCXQ T2, T5 \ - 
MULXQ M1+48(SB), DX, T2 \ + MULXQ M1+88(SB), DX, T2 \ ADCXQ AX, T2 \ \ ADOXQ T7, T1 \ @@ -1500,23 +1491,23 @@ TEXT ·fp751Mul(SB), $96-24 ADOXQ AX, T2 \ \ MOVQ 24+M0, DX \ - MULXQ M1+0(SB), T7, T8 \ + MULXQ M1+40(SB), T7, T8 \ XORQ AX, AX \ ADCXQ T4, T7 \ ADCXQ T8, T0 \ - MULXQ M1+8(SB), T10, T8 \ + MULXQ M1+48(SB), T10, T8 \ ADOXQ T10, T0 \ ADCXQ T8, T1 \ - MULXQ M1+16(SB), T8, T4 \ + MULXQ M1+56(SB), T8, T4 \ ADOXQ T8, T1 \ ADCXQ T4, T3 \ - MULXQ M1+24(SB), T10, T4 \ + MULXQ M1+64(SB), T10, T4 \ ADCXQ T4, T6 \ - MULXQ M1+32(SB), T8, T4 \ + MULXQ M1+72(SB), T8, T4 \ ADCXQ T4, T5 \ - MULXQ M1+40(SB), T9, T4 \ + MULXQ M1+80(SB), T9, T4 \ ADCXQ T4, T2 \ - MULXQ M1+48(SB), DX, T4 \ + MULXQ M1+88(SB), DX, T4 \ ADCXQ AX, T4 \ \ ADOXQ T10, T3 \ @@ -1527,41 +1518,41 @@ TEXT ·fp751Mul(SB), $96-24 #define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ MOVQ 0+M0, DX \ - MULXQ M1+0(SB), T1, T0 \ - MULXQ M1+8(SB), T3, T2 \ + MULXQ M1+40(SB), T1, T0 \ + MULXQ M1+48(SB), T3, T2 \ MOVQ T1, 0+C \ // C0_final XORQ AX, AX \ - MULXQ M1+16(SB), T5, T4 \ + MULXQ M1+56(SB), T5, T4 \ ADDQ T3, T0 \ ADCQ T5, T2 \ - MULXQ M1+24(SB), T3, T1 \ + MULXQ M1+64(SB), T3, T1 \ ADCQ T3, T4 \ - MULXQ M1+32(SB), T6, T5 \ + MULXQ M1+72(SB), T6, T5 \ ADCQ T6, T1 \ - MULXQ M1+40(SB), T7, T3 \ + MULXQ M1+80(SB), T7, T3 \ ADCQ T7, T5 \ - MULXQ M1+48(SB), T8, T6 \ + MULXQ M1+88(SB), T8, T6 \ ADCQ T8, T3 \ ADCQ AX, T6 \ \ MOVQ 8+M0, DX \ - MULXQ M1+0(SB), T7, T8 \ + MULXQ M1+40(SB), T7, T8 \ ADDQ T7, T0 \ MOVQ T0, 8+C \ // C1_final ADCQ T8, T2 \ - MULXQ M1+8(SB), T8, T7 \ + MULXQ M1+48(SB), T8, T7 \ MOVQ T8, 32+C \ ADCQ T7, T4 \ - MULXQ M1+16(SB), T8, T0 \ + MULXQ M1+56(SB), T8, T0 \ MOVQ T8, 40+C \ ADCQ T1, T0 \ - MULXQ M1+24(SB), T7, T1 \ + MULXQ M1+64(SB), T7, T1 \ ADCQ T5, T1 \ - MULXQ M1+32(SB), T8, T5 \ + MULXQ M1+72(SB), T8, T5 \ ADCQ T5, T3 \ - MULXQ M1+40(SB), T9, T5 \ + MULXQ M1+80(SB), T9, T5 \ ADCQ T5, T6 \ - MULXQ M1+48(SB), DX, T5 \ + MULXQ M1+88(SB), DX, T5 \ ADCQ AX, 
T5 \ \ XORQ AX, AX \ @@ -1574,23 +1565,23 @@ TEXT ·fp751Mul(SB), $96-24 ADCQ AX, T5 \ \ MOVQ 16+M0, DX \ - MULXQ M1+0(SB), T7, T8 \ + MULXQ M1+40(SB), T7, T8 \ ADDQ T7, T2 \ MOVQ T2, 16+C \ // C2_final ADCQ T8, T4 \ - MULXQ M1+8(SB), T7, T8 \ + MULXQ M1+48(SB), T7, T8 \ MOVQ T7, 32+C \ ADCQ T8, T0 \ - MULXQ M1+16(SB), T8, T2 \ + MULXQ M1+56(SB), T8, T2 \ MOVQ T8, 40+C \ ADCQ T2, T1 \ - MULXQ M1+24(SB), T7, T2 \ + MULXQ M1+64(SB), T7, T2 \ ADCQ T2, T3 \ - MULXQ M1+32(SB), T8, T2 \ + MULXQ M1+72(SB), T8, T2 \ ADCQ T2, T6 \ - MULXQ M1+40(SB), T9, T2 \ + MULXQ M1+80(SB), T9, T2 \ ADCQ T2, T5 \ - MULXQ M1+48(SB), DX, T2 \ + MULXQ M1+88(SB), DX, T2 \ ADCQ AX, T2 \ \ XORQ AX, AX \ @@ -1603,22 +1594,22 @@ TEXT ·fp751Mul(SB), $96-24 ADCQ AX, T2 \ \ MOVQ 24+M0, DX \ - MULXQ M1+0(SB), T7, T8 \ + MULXQ M1+40(SB), T7, T8 \ ADDQ T4, T7 \ ADCQ T8, T0 \ - MULXQ M1+8(SB), T10, T8 \ + MULXQ M1+48(SB), T10, T8 \ MOVQ T10, 32+C \ ADCQ T8, T1 \ - MULXQ M1+16(SB), T8, T4 \ + MULXQ M1+56(SB), T8, T4 \ MOVQ T8, 40+C \ ADCQ T4, T3 \ - MULXQ M1+24(SB), T10, T4 \ + MULXQ M1+64(SB), T10, T4 \ ADCQ T4, T6 \ - MULXQ M1+32(SB), T8, T4 \ + MULXQ M1+72(SB), T8, T4 \ ADCQ T4, T5 \ - MULXQ M1+40(SB), T9, T4 \ + MULXQ M1+80(SB), T9, T4 \ ADCQ T4, T2 \ - MULXQ M1+48(SB), DX, T4 \ + MULXQ M1+88(SB), DX, T4 \ ADCQ AX, T4 \ \ XORQ AX, AX \ @@ -1752,17 +1743,17 @@ TEXT ·fp751MontgomeryReduceBMI2ADX(SB), $0-16 MOVQ x+8(FP), REG_P1 // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2adx(0(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + mul256x448bmi2adx(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2adx(32(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, 
CX, R15) + mul256x448bmi2adx(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2adx(64(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + mul256x448bmi2adx(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) @@ -1773,17 +1764,17 @@ TEXT ·fp751MontgomeryReduceBMI2(SB), $0-16 MOVQ x+8(FP), REG_P1 // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2(0(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + mul256x448bmi2(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2(32(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + mul256x448bmi2(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2(64(REG_P1), P751P1_NZ<>, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) + mul256x448bmi2(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) From 9224fbce1911319097f41299c41fa442cfac27e7 Mon Sep 17 00:00:00 2001 From: Ko- Date: Thu, 23 Aug 2018 10:06:25 
+0100 Subject: [PATCH 5/7] add golang.org/x/sys/cpu as dependency to makefile --- Makefile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 0444f84..052907f 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,9 @@ PRJ_DIR = $(abspath $(dir $(MK_FILE_PATH))) GOPATH_LOCAL = $(PRJ_DIR)/build GOPATH_DIR = github.com/cloudflare/p751sidh CSHAKE_PKG = github.com/henrydcase/nobs/hash/sha3 +CPU_PKG = golang.org/x/sys/cpu TARGETS = p751toolbox sidh sike +GO ?= go GOARCH ?= OPTS_GCCGO ?= -compiler gccgo -O2 -g OPTS_TAGS ?= -tags=noasm @@ -20,7 +22,7 @@ clean: rm -rf coverage*.txt prep: - GOPATH=$(GOPATH_LOCAL) go get $(CSHAKE_PKG) + GOPATH=$(GOPATH_LOCAL) $(GO) get $(CSHAKE_PKG) $(CPU_PKG) mkdir -p $(GOPATH_LOCAL)/src/$(GOPATH_DIR) cp -rf p751toolbox $(GOPATH_LOCAL)/src/$(GOPATH_DIR) cp -rf sidh $(GOPATH_LOCAL)/src/$(GOPATH_DIR) @@ -28,17 +30,17 @@ prep: cp -rf etc $(GOPATH_LOCAL)/src/$(GOPATH_DIR) test-%: prep - GOPATH=$(GOPATH_LOCAL) go test -v $(OPTS) $(GOPATH_DIR)/$* + GOPATH=$(GOPATH_LOCAL) $(GO) test -v $(OPTS) $(GOPATH_DIR)/$* bench-%: prep - cd $*; GOPATH=$(GOPATH_LOCAL) go test -v $(OPTS) -bench=. + cd $*; GOPATH=$(GOPATH_LOCAL) $(GO) test -v $(OPTS) -bench=. 
cover-%: prep - GOPATH=$(GOPATH_LOCAL) go test \ + GOPATH=$(GOPATH_LOCAL) $(GO) test \ -race -coverprofile=coverage_$*.txt -covermode=atomic $(OPTS) $(GOPATH_DIR)/$* cat coverage_$*.txt >> coverage.txt rm coverage_$*.txt test: $(addprefix test-, $(TARGETS)) bench: $(addprefix bench-, $(TARGETS)) -cover: $(addprefix cover-, $(TARGETS)) \ No newline at end of file +cover: $(addprefix cover-, $(TARGETS)) From ade27e3126219a4ad5a7d7f9b8f2a5fb6456a764 Mon Sep 17 00:00:00 2001 From: Ko- Date: Tue, 28 Aug 2018 16:54:58 +0100 Subject: [PATCH 6/7] p751: add some more comments to clarify assembly code --- p751toolbox/field_amd64.s | 18 ++++++++++++++++++ p751toolbox/field_decl.go | 9 ++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/p751toolbox/field_amd64.s b/p751toolbox/field_amd64.s index 75d142f..1e265df 100644 --- a/p751toolbox/field_amd64.s +++ b/p751toolbox/field_amd64.s @@ -1417,6 +1417,11 @@ TEXT ·fp751Mul(SB), $96-24 RET +// This multiplies a 256-bit number pointed to by M0 with p751+1. +// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element. +// C points to the place to store the result and should be at least 192 bits. +// This should only be used when the BMI2 and ADX instruction set extensions +// are available. #define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ MOVQ 0+M0, DX \ MULXQ M1+40(SB), T1, T0 \ @@ -1516,6 +1521,11 @@ TEXT ·fp751Mul(SB), $96-24 ADOXQ DX, T2 \ ADOXQ AX, T4 +// This multiplies a 256-bit number pointed to by M0 with p751+1. +// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element. +// C points to the place to store the result and should be at least 384 bits: the low 192 bits receive the result and offsets 32 and 40 are used as scratch. +// This should only be used when the BMI2 instruction set extension is +// available. 
#define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ MOVQ 0+M0, DX \ MULXQ M1+40(SB), T1, T0 \ @@ -1738,6 +1748,9 @@ TEXT ·fp751Mul(SB), $96-24 MOVQ T5, 80+C \ MOVQ T6, 88+C +// This implements the Montgomery reduction algorithm described in +// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. +// This assumes that the BMI2 and ADX instruction set extensions are available. TEXT ·fp751MontgomeryReduceBMI2ADX(SB), $0-16 MOVQ z+0(FP), REG_P2 MOVQ x+8(FP), REG_P1 @@ -1759,6 +1772,9 @@ TEXT ·fp751MontgomeryReduceBMI2ADX(SB), $0-16 RET +// This implements the Montgomery reduction algorithm described in +// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. +// This assumes that the BMI2 instruction set extension is available. TEXT ·fp751MontgomeryReduceBMI2(SB), $0-16 MOVQ z+0(FP), REG_P2 MOVQ x+8(FP), REG_P1 @@ -1780,6 +1796,8 @@ TEXT ·fp751MontgomeryReduceBMI2(SB), $0-16 RET +// This implements the straightforward Montgomery reduction algorithm without +// using specific instruction set extensions. TEXT ·fp751MontgomeryReduceFallback(SB), $0-16 MOVQ z+0(FP), REG_P2 diff --git a/p751toolbox/field_decl.go b/p751toolbox/field_decl.go index c3e05c1..bd68b7b 100644 --- a/p751toolbox/field_decl.go +++ b/p751toolbox/field_decl.go @@ -36,8 +36,10 @@ func fp751X2SubLazy(z, x, y *fp751X2) //go:noescape func fp751Mul(z *fp751X2, x, y *Fp751Element) -// Perform Montgomery reduction: set z = x R^{-1} (mod 2*p). -// Destroys the input value. +// Function pointer that should point to one of the +// fp751MontgomeryReduce implementations below. +// When set, it performs Montgomery reduction: set z = x R^{-1} (mod 2*p). +// It may destroy the input value. var fp751MontgomeryReduce func(z *Fp751Element, x *fp751X2) //go:noescape @@ -53,7 +55,8 @@ func fp751MontgomeryReduceFallback(z *Fp751Element, x *fp751X2) //go:noescape func fp751StrongReduce(x *Fp751Element) -// Choose the fastest variant depending on CPU capabilities. 
+// On initialization, set the fp751MontgomeryReduce function pointer to the +// fastest implementation depending on CPU capabilities. func init() { if cpu.X86.HasBMI2 { if cpu.X86.HasADX { From 68adaf1ac78ffc0d10236e94c9f22b5ef9a3645e Mon Sep 17 00:00:00 2001 From: Ko- Date: Tue, 28 Aug 2018 18:08:42 +0100 Subject: [PATCH 7/7] p751: add tests for montgomery reduction implementations --- p751toolbox/field_amd64_test.go | 73 +++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 p751toolbox/field_amd64_test.go diff --git a/p751toolbox/field_amd64_test.go b/p751toolbox/field_amd64_test.go new file mode 100644 index 0000000..b26636b --- /dev/null +++ b/p751toolbox/field_amd64_test.go @@ -0,0 +1,73 @@ +// +build amd64,!noasm + +package p751toolbox + +import ( + "golang.org/x/sys/cpu" + "testing" + "testing/quick" +) + +func TestFp751MontgomeryReduce(t *testing.T) { + // First make sure that at least one value with a known result reduces + // correctly as defined in TestPrimeFieldElementToBigInt. + fp751MontgomeryReduce = fp751MontgomeryReduceFallback + t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt) + + if !cpu.X86.HasBMI2 { + return + } + + fp751MontgomeryReduce = fp751MontgomeryReduceBMI2 + t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt) + + // Also check that the BMI2 implementation produces the same results + // as the fallback implementation. + compareMontgomeryReduce := func(x, y PrimeFieldElement) bool { + var z, zbackup fp751X2 + var zred1, zred2 Fp751Element + + fp751Mul(&z, &x.A, &y.A) + zbackup = z + + fp751MontgomeryReduceFallback(&zred1, &z) + // z may be destroyed. 
+ z = zbackup + fp751MontgomeryReduceBMI2(&zred2, &z) + + return zred1 == zred2 + } + + if err := quick.Check(compareMontgomeryReduce, quickCheckConfig); err != nil { + t.Error(err) + } + + if !cpu.X86.HasADX { + return + } + + fp751MontgomeryReduce = fp751MontgomeryReduceBMI2ADX + t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt) + + // Check that the BMI2ADX implementation produces the same results as + // the BMI2 implementation. By transitivity, it should also produce the + // same results as the fallback implementation. + compareMontgomeryReduce = func(x, y PrimeFieldElement) bool { + var z, zbackup fp751X2 + var zred1, zred2 Fp751Element + + fp751Mul(&z, &x.A, &y.A) + zbackup = z + + fp751MontgomeryReduceBMI2(&zred1, &z) + // z may be destroyed. + z = zbackup + fp751MontgomeryReduceBMI2ADX(&zred2, &z) + + return zred1 == zred2 + } + + if err := quick.Check(compareMontgomeryReduce, quickCheckConfig); err != nil { + t.Error(err) + } +}