1
0
mirror of https://github.com/henrydcase/nobs.git synced 2024-11-23 07:38:56 +00:00
nobs/dh/sidh/p751/arith_amd64.s

2539 lines
43 KiB
ArmAsm
Raw Normal View History

2018-07-23 23:18:38 +01:00
// +build amd64,!noasm
#include "textflag.h"
// p751 + 1
#define P751P1_5 $0xEEB0000000000000
#define P751P1_6 $0xE3EC968549F878A8
#define P751P1_7 $0xDA959B1A13F7CC76
#define P751P1_8 $0x084E9867D6EBE876
#define P751P1_9 $0x8562B5045CB25748
#define P751P1_10 $0x0E12909F97BADC66
#define P751P1_11 $0x00006FE5D541F71C
#define P751_0 $0xFFFFFFFFFFFFFFFF
#define P751_5 $0xEEAFFFFFFFFFFFFF
#define P751_6 $0xE3EC968549F878A8
#define P751_7 $0xDA959B1A13F7CC76
#define P751_8 $0x084E9867D6EBE876
#define P751_9 $0x8562B5045CB25748
#define P751_10 $0x0E12909F97BADC66
#define P751_11 $0x00006FE5D541F71C
#define P751X2_0 $0xFFFFFFFFFFFFFFFE
#define P751X2_1 $0xFFFFFFFFFFFFFFFF
#define P751X2_5 $0xDD5FFFFFFFFFFFFF
#define P751X2_6 $0xC7D92D0A93F0F151
#define P751X2_7 $0xB52B363427EF98ED
#define P751X2_8 $0x109D30CFADD7D0ED
#define P751X2_9 $0x0AC56A08B964AE90
#define P751X2_10 $0x1C25213F2F75B8CD
#define P751X2_11 $0x0000DFCBAA83EE38
// The MSR code uses these registers for parameter passing. Keep using
// them to avoid significant code changes. This means that when the Go
// assembler does something strange, we can diff the machine code
// against a different assembler to find out what Go did.
#define REG_P1 DI
#define REG_P2 SI
#define REG_P3 DX
TEXT ·fp751StrongReduce(SB), NOSPLIT, $0-8
MOVQ x+0(FP), REG_P1
// Zero AX for later use:
XORQ AX, AX
// Load p into registers:
MOVQ P751_0, R8
// P751_{1,2,3,4} = P751_0, so reuse R8
MOVQ P751_5, R9
MOVQ P751_6, R10
MOVQ P751_7, R11
MOVQ P751_8, R12
MOVQ P751_9, R13
MOVQ P751_10, R14
MOVQ P751_11, R15
// Set x <- x - p
SUBQ R8, (REG_P1)
SBBQ R8, (8)(REG_P1)
SBBQ R8, (16)(REG_P1)
SBBQ R8, (24)(REG_P1)
SBBQ R8, (32)(REG_P1)
SBBQ R9, (40)(REG_P1)
SBBQ R10, (48)(REG_P1)
SBBQ R11, (56)(REG_P1)
SBBQ R12, (64)(REG_P1)
SBBQ R13, (72)(REG_P1)
SBBQ R14, (80)(REG_P1)
SBBQ R15, (88)(REG_P1)
// Save carry flag indicating x-p < 0 as a mask in AX
SBBQ $0, AX
// Conditionally add p to x if x-p < 0
ANDQ AX, R8
ANDQ AX, R9
ANDQ AX, R10
ANDQ AX, R11
ANDQ AX, R12
ANDQ AX, R13
ANDQ AX, R14
ANDQ AX, R15
ADDQ R8, (REG_P1)
ADCQ R8, (8)(REG_P1)
ADCQ R8, (16)(REG_P1)
ADCQ R8, (24)(REG_P1)
ADCQ R8, (32)(REG_P1)
ADCQ R9, (40)(REG_P1)
ADCQ R10, (48)(REG_P1)
ADCQ R11, (56)(REG_P1)
ADCQ R12, (64)(REG_P1)
ADCQ R13, (72)(REG_P1)
ADCQ R14, (80)(REG_P1)
ADCQ R15, (88)(REG_P1)
RET
TEXT ·fp751ConditionalSwap(SB), NOSPLIT, $0-17
MOVQ x+0(FP), REG_P1
MOVQ y+8(FP), REG_P2
MOVBLZX choice+16(FP), AX // AL = 0 or 1
// Make AX, so that either all bits are set or non
// AX = 0 or 1
NEGQ AX
// Fill xmm15. After this step first half of XMM15 is
// just zeros and second half is whatever in AX
MOVQ AX, X15
// Copy lower double word everywhere else. So that
// XMM15=AL|AL|AL|AL. As AX has either all bits set
// or non result will be that XMM15 has also either
// all bits set or non of them.
PSHUFD $0, X15, X15
#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
MOVOU (idx*16)(REG_P1), X0 \
MOVOU (idx*16)(REG_P2), X1 \
\ // X2 = mask & (X0 ^ X1)
MOVO X1, X2 \
PXOR X0, X2 \
PAND X15, X2 \
\
PXOR X2, X0 \
PXOR X2, X1 \
\
MOVOU X0, (idx*16)(REG_P1) \
MOVOU X1, (idx*16)(REG_P2)
#endif
CSWAP_BLOCK(0)
CSWAP_BLOCK(1)
CSWAP_BLOCK(2)
CSWAP_BLOCK(3)
CSWAP_BLOCK(4)
CSWAP_BLOCK(5)
#ifdef CSWAP_BLOCK
#undef CSWAP_BLOCK
#endif
2018-07-23 23:18:38 +01:00
RET
TEXT ·fp751AddReduced(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
MOVQ (64)(REG_P1), CX
ADDQ (REG_P2), R8
ADCQ (8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
ADCQ (48)(REG_P2), R14
ADCQ (56)(REG_P2), R15
ADCQ (64)(REG_P2), CX
MOVQ (72)(REG_P1), AX
ADCQ (72)(REG_P2), AX
MOVQ AX, (72)(REG_P3)
MOVQ (80)(REG_P1), AX
ADCQ (80)(REG_P2), AX
MOVQ AX, (80)(REG_P3)
MOVQ (88)(REG_P1), AX
ADCQ (88)(REG_P2), AX
MOVQ AX, (88)(REG_P3)
MOVQ P751X2_0, AX
SUBQ AX, R8
MOVQ P751X2_1, AX
SBBQ AX, R9
SBBQ AX, R10
SBBQ AX, R11
SBBQ AX, R12
MOVQ P751X2_5, AX
SBBQ AX, R13
MOVQ P751X2_6, AX
SBBQ AX, R14
MOVQ P751X2_7, AX
SBBQ AX, R15
MOVQ P751X2_8, AX
SBBQ AX, CX
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
MOVQ CX, (64)(REG_P3)
MOVQ (72)(REG_P3), R8
MOVQ (80)(REG_P3), R9
MOVQ (88)(REG_P3), R10
MOVQ P751X2_9, AX
SBBQ AX, R8
MOVQ P751X2_10, AX
SBBQ AX, R9
MOVQ P751X2_11, AX
SBBQ AX, R10
MOVQ R8, (72)(REG_P3)
MOVQ R9, (80)(REG_P3)
MOVQ R10, (88)(REG_P3)
MOVQ $0, AX
2018-07-23 23:18:38 +01:00
SBBQ $0, AX
MOVQ P751X2_0, SI
ANDQ AX, SI
MOVQ P751X2_1, R8
ANDQ AX, R8
MOVQ P751X2_5, R9
ANDQ AX, R9
MOVQ P751X2_6, R10
ANDQ AX, R10
MOVQ P751X2_7, R11
ANDQ AX, R11
MOVQ P751X2_8, R12
ANDQ AX, R12
MOVQ P751X2_9, R13
ANDQ AX, R13
MOVQ P751X2_10, R14
ANDQ AX, R14
MOVQ P751X2_11, R15
ANDQ AX, R15
MOVQ (REG_P3), AX
ADDQ SI, AX
MOVQ AX, (REG_P3)
MOVQ (8)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (8)(REG_P3)
MOVQ (16)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (16)(REG_P3)
MOVQ (24)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (24)(REG_P3)
MOVQ (32)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (32)(REG_P3)
MOVQ (40)(REG_P3), AX
ADCQ R9, AX
MOVQ AX, (40)(REG_P3)
MOVQ (48)(REG_P3), AX
ADCQ R10, AX
MOVQ AX, (48)(REG_P3)
MOVQ (56)(REG_P3), AX
ADCQ R11, AX
MOVQ AX, (56)(REG_P3)
MOVQ (64)(REG_P3), AX
ADCQ R12, AX
MOVQ AX, (64)(REG_P3)
MOVQ (72)(REG_P3), AX
ADCQ R13, AX
MOVQ AX, (72)(REG_P3)
MOVQ (80)(REG_P3), AX
ADCQ R14, AX
MOVQ AX, (80)(REG_P3)
MOVQ (88)(REG_P3), AX
ADCQ R15, AX
MOVQ AX, (88)(REG_P3)
RET
TEXT ·fp751SubReduced(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
MOVQ (64)(REG_P1), CX
SUBQ (REG_P2), R8
SBBQ (8)(REG_P2), R9
SBBQ (16)(REG_P2), R10
SBBQ (24)(REG_P2), R11
SBBQ (32)(REG_P2), R12
SBBQ (40)(REG_P2), R13
SBBQ (48)(REG_P2), R14
SBBQ (56)(REG_P2), R15
SBBQ (64)(REG_P2), CX
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
MOVQ CX, (64)(REG_P3)
MOVQ (72)(REG_P1), AX
SBBQ (72)(REG_P2), AX
MOVQ AX, (72)(REG_P3)
MOVQ (80)(REG_P1), AX
SBBQ (80)(REG_P2), AX
MOVQ AX, (80)(REG_P3)
MOVQ (88)(REG_P1), AX
SBBQ (88)(REG_P2), AX
MOVQ AX, (88)(REG_P3)
MOVQ $0, AX
2018-07-23 23:18:38 +01:00
SBBQ $0, AX
MOVQ P751X2_0, SI
ANDQ AX, SI
MOVQ P751X2_1, R8
ANDQ AX, R8
MOVQ P751X2_5, R9
ANDQ AX, R9
MOVQ P751X2_6, R10
ANDQ AX, R10
MOVQ P751X2_7, R11
ANDQ AX, R11
MOVQ P751X2_8, R12
ANDQ AX, R12
MOVQ P751X2_9, R13
ANDQ AX, R13
MOVQ P751X2_10, R14
ANDQ AX, R14
MOVQ P751X2_11, R15
ANDQ AX, R15
MOVQ (REG_P3), AX
ADDQ SI, AX
MOVQ AX, (REG_P3)
MOVQ (8)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (8)(REG_P3)
MOVQ (16)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (16)(REG_P3)
MOVQ (24)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (24)(REG_P3)
MOVQ (32)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (32)(REG_P3)
MOVQ (40)(REG_P3), AX
ADCQ R9, AX
MOVQ AX, (40)(REG_P3)
MOVQ (48)(REG_P3), AX
ADCQ R10, AX
MOVQ AX, (48)(REG_P3)
MOVQ (56)(REG_P3), AX
ADCQ R11, AX
MOVQ AX, (56)(REG_P3)
MOVQ (64)(REG_P3), AX
ADCQ R12, AX
MOVQ AX, (64)(REG_P3)
MOVQ (72)(REG_P3), AX
ADCQ R13, AX
MOVQ AX, (72)(REG_P3)
MOVQ (80)(REG_P3), AX
ADCQ R14, AX
MOVQ AX, (80)(REG_P3)
MOVQ (88)(REG_P3), AX
ADCQ R15, AX
MOVQ AX, (88)(REG_P3)
RET
TEXT ·fp751Mul(SB), $96-24
// Here we store the destination in CX instead of in REG_P3 because the
// multiplication instructions use DX as an implicit destination
// operand: MULQ $REG sets DX:AX <-- AX * $REG.
MOVQ z+0(FP), CX
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
XORQ AX, AX
MOVQ (48)(REG_P1), R8
MOVQ (56)(REG_P1), R9
MOVQ (64)(REG_P1), R10
MOVQ (72)(REG_P1), R11
MOVQ (80)(REG_P1), R12
MOVQ (88)(REG_P1), R13
ADDQ (REG_P1), R8
ADCQ (8)(REG_P1), R9
ADCQ (16)(REG_P1), R10
ADCQ (24)(REG_P1), R11
ADCQ (32)(REG_P1), R12
ADCQ (40)(REG_P1), R13
MOVQ R8, (CX)
MOVQ R9, (8)(CX)
MOVQ R10, (16)(CX)
MOVQ R11, (24)(CX)
MOVQ R12, (32)(CX)
MOVQ R13, (40)(CX)
SBBQ $0, AX
XORQ DX, DX
MOVQ (48)(REG_P2), R8
MOVQ (56)(REG_P2), R9
MOVQ (64)(REG_P2), R10
MOVQ (72)(REG_P2), R11
MOVQ (80)(REG_P2), R12
MOVQ (88)(REG_P2), R13
ADDQ (REG_P2), R8
ADCQ (8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
MOVQ R8, (48)(CX)
MOVQ R9, (56)(CX)
MOVQ R10, (64)(CX)
MOVQ R11, (72)(CX)
MOVQ R12, (80)(CX)
MOVQ R13, (88)(CX)
SBBQ $0, DX
MOVQ AX, (80)(SP)
MOVQ DX, (88)(SP)
// (SP[0-8],R10,R8,R9) <- (AH+AL)*(BH+BL)
MOVQ (CX), R11
MOVQ R8, AX
MULQ R11
MOVQ AX, (SP) // c0
MOVQ DX, R14
XORQ R15, R15
MOVQ R9, AX
MULQ R11
XORQ R9, R9
ADDQ AX, R14
ADCQ DX, R9
MOVQ (8)(CX), R12
MOVQ R8, AX
MULQ R12
ADDQ AX, R14
MOVQ R14, (8)(SP) // c1
ADCQ DX, R9
ADCQ $0, R15
XORQ R8, R8
MOVQ R10, AX
MULQ R11
ADDQ AX, R9
MOVQ (48)(CX), R13
ADCQ DX, R15
ADCQ $0, R8
MOVQ (16)(CX), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R15
MOVQ (56)(CX), AX
ADCQ $0, R8
MULQ R12
ADDQ AX, R9
MOVQ R9, (16)(SP) // c2
ADCQ DX, R15
ADCQ $0, R8
XORQ R9, R9
MOVQ (72)(CX), AX
MULQ R11
ADDQ AX, R15
ADCQ DX, R8
ADCQ $0, R9
MOVQ (24)(CX), AX
MULQ R13
ADDQ AX, R15
ADCQ DX, R8
ADCQ $0, R9
MOVQ R10, AX
MULQ R12
ADDQ AX, R15
ADCQ DX, R8
ADCQ $0, R9
MOVQ (16)(CX), R14
MOVQ (56)(CX), AX
MULQ R14
ADDQ AX, R15
MOVQ R15, (24)(SP) // c3
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (80)(CX), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (64)(CX), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (48)(CX), R15
MOVQ (32)(CX), AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (72)(CX), AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (24)(CX), R13
MOVQ (56)(CX), AX
MULQ R13
ADDQ AX, R8
MOVQ R8, (32)(SP) // c4
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (88)(CX), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (64)(CX), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (72)(CX), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (40)(CX), AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (80)(CX), AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (32)(CX), R15
MOVQ (56)(CX), AX
MULQ R15
ADDQ AX, R9
MOVQ R9, (40)(SP) // c5
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (64)(CX), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (88)(CX), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (80)(CX), AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (40)(CX), R11
MOVQ (56)(CX), AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (72)(CX), AX
MULQ R13
ADDQ AX, R10
MOVQ R10, (48)(SP) // c6
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (88)(CX), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (64)(CX), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (80)(CX), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (72)(CX), AX
MULQ R15
ADDQ AX, R8
MOVQ R8, (56)(SP) // c7
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (72)(CX), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (80)(CX), AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (88)(CX), AX
MULQ R13
ADDQ AX, R9
MOVQ R9, (64)(SP) // c8
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (88)(CX), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (80)(CX), AX
MULQ R11
ADDQ AX, R10 // c9
ADCQ DX, R8
ADCQ $0, R9
MOVQ (88)(CX), AX
MULQ R11
ADDQ AX, R8 // c10
ADCQ DX, R9 // c11
MOVQ (88)(SP), AX
MOVQ (CX), DX
ANDQ AX, R12
ANDQ AX, R14
ANDQ AX, DX
ANDQ AX, R13
ANDQ AX, R15
ANDQ AX, R11
MOVQ (48)(SP), AX
ADDQ AX, DX
MOVQ (56)(SP), AX
ADCQ AX, R12
MOVQ (64)(SP), AX
ADCQ AX, R14
ADCQ R10, R13
ADCQ R8, R15
ADCQ R9, R11
MOVQ (80)(SP), AX
MOVQ DX, (48)(SP)
MOVQ R12, (56)(SP)
MOVQ R14, (64)(SP)
MOVQ R13, (72)(SP)
MOVQ R15, (80)(SP)
MOVQ R11, (88)(SP)
MOVQ (48)(CX), R8
MOVQ (56)(CX), R9
MOVQ (64)(CX), R10
MOVQ (72)(CX), R11
MOVQ (80)(CX), R12
MOVQ (88)(CX), R13
ANDQ AX, R8
ANDQ AX, R9
ANDQ AX, R10
ANDQ AX, R11
ANDQ AX, R12
ANDQ AX, R13
MOVQ (48)(SP), AX
ADDQ AX, R8
MOVQ (56)(SP), AX
ADCQ AX, R9
MOVQ (64)(SP), AX
ADCQ AX, R10
MOVQ (72)(SP), AX
ADCQ AX, R11
MOVQ (80)(SP), AX
ADCQ AX, R12
MOVQ (88)(SP), AX
ADCQ AX, R13
MOVQ R8, (48)(SP)
MOVQ R9, (56)(SP)
MOVQ R11, (72)(SP)
// CX[0-11] <- AL*BL
MOVQ (REG_P1), R11
MOVQ (REG_P2), AX
MULQ R11
XORQ R9, R9
MOVQ AX, (CX) // c0
MOVQ R10, (64)(SP)
MOVQ DX, R8
MOVQ (8)(REG_P2), AX
MULQ R11
XORQ R10, R10
ADDQ AX, R8
MOVQ R12, (80)(SP)
ADCQ DX, R9
MOVQ (8)(REG_P1), R12
MOVQ (REG_P2), AX
MULQ R12
ADDQ AX, R8
MOVQ R8, (8)(CX) // c1
ADCQ DX, R9
MOVQ R13, (88)(SP)
ADCQ $0, R10
XORQ R8, R8
MOVQ (16)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (REG_P2), R13
MOVQ (16)(REG_P1), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (8)(REG_P2), AX
MULQ R12
ADDQ AX, R9
MOVQ R9, (16)(CX) // c2
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (24)(REG_P2), AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (24)(REG_P1), AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (16)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (16)(REG_P1), R14
MOVQ (8)(REG_P2), AX
MULQ R14
ADDQ AX, R10
MOVQ R10, (24)(CX) // c3
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (32)(REG_P2), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (16)(REG_P2), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (32)(REG_P1), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (24)(REG_P2), AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (24)(REG_P1), R13
MOVQ (8)(REG_P2), AX
MULQ R13
ADDQ AX, R8
MOVQ R8, (32)(CX) // c4
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (40)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (16)(REG_P2), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (24)(REG_P2), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (40)(REG_P1), R11
MOVQ (REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (32)(REG_P2), AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (32)(REG_P1), R15
MOVQ (8)(REG_P2), AX
MULQ R15
ADDQ AX, R9
MOVQ R9, (40)(CX) //c5
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (16)(REG_P2), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (40)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (32)(REG_P2), AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (8)(REG_P2), AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (24)(REG_P2), AX
MULQ R13
ADDQ AX, R10
MOVQ R10, (48)(CX) // c6
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (40)(REG_P2), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (16)(REG_P2), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (32)(REG_P2), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (24)(REG_P2), AX
MULQ R15
ADDQ AX, R8
MOVQ R8, (56)(CX) // c7
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (24)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (32)(REG_P2), AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (40)(REG_P2), AX
MULQ R13
ADDQ AX, R9
MOVQ R9, (64)(CX) // c8
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (40)(REG_P2), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (32)(REG_P2), AX
MULQ R11
ADDQ AX, R10
MOVQ R10, (72)(CX) // c9
ADCQ DX, R8
ADCQ $0, R9
MOVQ (40)(REG_P2), AX
MULQ R11
ADDQ AX, R8
MOVQ R8, (80)(CX) // c10
ADCQ DX, R9
MOVQ R9, (88)(CX) // c11
// CX[12-23] <- AH*BH
MOVQ (48)(REG_P1), R11
MOVQ (48)(REG_P2), AX
MULQ R11
XORQ R9, R9
MOVQ AX, (96)(CX) // c0
MOVQ DX, R8
MOVQ (56)(REG_P2), AX
MULQ R11
XORQ R10, R10
ADDQ AX, R8
ADCQ DX, R9
MOVQ (56)(REG_P1), R12
MOVQ (48)(REG_P2), AX
MULQ R12
ADDQ AX, R8
MOVQ R8, (104)(CX) // c1
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (64)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (48)(REG_P2), R13
MOVQ (64)(REG_P1), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (56)(REG_P2), AX
MULQ R12
ADDQ AX, R9
MOVQ R9, (112)(CX) // c2
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (72)(REG_P2), AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (72)(REG_P1), AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (64)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (64)(REG_P1), R14
MOVQ (56)(REG_P2), AX
MULQ R14
ADDQ AX, R10
MOVQ R10, (120)(CX) // c3
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (80)(REG_P2), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (64)(REG_P2), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (80)(REG_P1), R15
MOVQ R13, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (72)(REG_P2), AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (72)(REG_P1), R13
MOVQ (56)(REG_P2), AX
MULQ R13
ADDQ AX, R8
MOVQ R8, (128)(CX) // c4
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (88)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (64)(REG_P2), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (72)(REG_P2), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (88)(REG_P1), R11
MOVQ (48)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (80)(REG_P2), AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (56)(REG_P2), AX
MULQ R15
ADDQ AX, R9
MOVQ R9, (136)(CX) // c5
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (64)(REG_P2), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (88)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (80)(REG_P2), AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (56)(REG_P2), AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (72)(REG_P2), AX
MULQ R13
ADDQ AX, R10
MOVQ R10, (144)(CX) // c6
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (88)(REG_P2), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (64)(REG_P2), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (80)(REG_P2), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (72)(REG_P2), AX
MULQ R15
ADDQ AX, R8
MOVQ R8, (152)(CX) // c7
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (72)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (80)(REG_P2), AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (88)(REG_P2), AX
MULQ R13
ADDQ AX, R9
MOVQ R9, (160)(CX) // c8
ADCQ DX, R10
ADCQ $0, R8
MOVQ (88)(REG_P2), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
MOVQ (80)(REG_P2), AX
MULQ R11
ADDQ AX, R10
MOVQ R10, (168)(CX) // c9
ADCQ DX, R8
MOVQ (88)(REG_P2), AX
MULQ R11
ADDQ AX, R8
MOVQ R8, (176)(CX) // c10
ADCQ $0, DX
MOVQ DX, (184)(CX) // c11
// [R8-R15,AX,DX,DI,(SP)] <- (AH+AL)*(BH+BL)-AL*BL
MOVQ (SP), R8
SUBQ (CX), R8
MOVQ (8)(SP), R9
SBBQ (8)(CX), R9
MOVQ (16)(SP), R10
SBBQ (16)(CX), R10
MOVQ (24)(SP), R11
SBBQ (24)(CX), R11
MOVQ (32)(SP), R12
SBBQ (32)(CX), R12
MOVQ (40)(SP), R13
SBBQ (40)(CX), R13
MOVQ (48)(SP), R14
SBBQ (48)(CX), R14
MOVQ (56)(SP), R15
SBBQ (56)(CX), R15
MOVQ (64)(SP), AX
SBBQ (64)(CX), AX
MOVQ (72)(SP), DX
SBBQ (72)(CX), DX
MOVQ (80)(SP), DI
SBBQ (80)(CX), DI
MOVQ (88)(SP), SI
SBBQ (88)(CX), SI
MOVQ SI, (SP)
// [R8-R15,AX,DX,DI,(SP)] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH
MOVQ (96)(CX), SI
SUBQ SI, R8
MOVQ (104)(CX), SI
SBBQ SI, R9
MOVQ (112)(CX), SI
SBBQ SI, R10
MOVQ (120)(CX), SI
SBBQ SI, R11
MOVQ (128)(CX), SI
SBBQ SI, R12
MOVQ (136)(CX), SI
SBBQ SI, R13
MOVQ (144)(CX), SI
SBBQ SI, R14
MOVQ (152)(CX), SI
SBBQ SI, R15
MOVQ (160)(CX), SI
SBBQ SI, AX
MOVQ (168)(CX), SI
SBBQ SI, DX
MOVQ (176)(CX), SI
SBBQ SI, DI
MOVQ (SP), SI
SBBQ (184)(CX), SI
// FINAL RESULT
ADDQ (48)(CX), R8
MOVQ R8, (48)(CX)
ADCQ (56)(CX), R9
MOVQ R9, (56)(CX)
ADCQ (64)(CX), R10
MOVQ R10, (64)(CX)
ADCQ (72)(CX), R11
MOVQ R11, (72)(CX)
ADCQ (80)(CX), R12
MOVQ R12, (80)(CX)
ADCQ (88)(CX), R13
MOVQ R13, (88)(CX)
ADCQ (96)(CX), R14
MOVQ R14, (96)(CX)
ADCQ (104)(CX), R15
MOVQ R15, (104)(CX)
ADCQ (112)(CX), AX
MOVQ AX, (112)(CX)
ADCQ (120)(CX), DX
MOVQ DX, (120)(CX)
ADCQ (128)(CX), DI
MOVQ DI, (128)(CX)
ADCQ (136)(CX), SI
MOVQ SI, (136)(CX)
MOVQ (144)(CX), AX
ADCQ $0, AX
MOVQ AX, (144)(CX)
MOVQ (152)(CX), AX
ADCQ $0, AX
MOVQ AX, (152)(CX)
MOVQ (160)(CX), AX
ADCQ $0, AX
MOVQ AX, (160)(CX)
MOVQ (168)(CX), AX
ADCQ $0, AX
MOVQ AX, (168)(CX)
MOVQ (176)(CX), AX
ADCQ $0, AX
MOVQ AX, (176)(CX)
MOVQ (184)(CX), AX
ADCQ $0, AX
MOVQ AX, (184)(CX)
RET
// This multiplies a 256-bit number pointed to by M0 with p751+1.
// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element.
// C points to the place to store the result and should be at least 192 bits.
// This should only be used when the BMI2 and ADX instruction set extensions
// are available.
#define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
MOVQ 0+M0, DX \
MULXQ M1+40(SB), T1, T0 \
MULXQ M1+48(SB), T3, T2 \
MOVQ T1, 0+C \ // C0_final
XORQ AX, AX \
MULXQ M1+56(SB), T5, T4 \
ADOXQ T3, T0 \
ADOXQ T5, T2 \
MULXQ M1+64(SB), T3, T1 \
ADOXQ T3, T4 \
MULXQ M1+72(SB), T6, T5 \
ADOXQ T6, T1 \
MULXQ M1+80(SB), T7, T3 \
ADOXQ T7, T5 \
MULXQ M1+88(SB), T8, T6 \
ADOXQ T8, T3 \
ADOXQ AX, T6 \
\
MOVQ 8+M0, DX \
MULXQ M1+40(SB), T7, T8 \
XORQ AX, AX \
ADCXQ T7, T0 \
MOVQ T0, 8+C \ // C1_final
ADCXQ T8, T2 \
MULXQ M1+48(SB), T8, T7 \
ADOXQ T8, T2 \
ADCXQ T7, T4 \
MULXQ M1+56(SB), T8, T0 \
ADOXQ T8, T4 \
ADCXQ T1, T0 \
MULXQ M1+64(SB), T7, T1 \
ADCXQ T5, T1 \
MULXQ M1+72(SB), T8, T5 \
ADCXQ T5, T3 \
MULXQ M1+80(SB), T9, T5 \
ADCXQ T5, T6 \
MULXQ M1+88(SB), DX, T5 \
ADCXQ AX, T5 \
\
ADOXQ T7, T0 \
ADOXQ T8, T1 \
ADOXQ T9, T3 \
ADOXQ DX, T6 \
ADOXQ AX, T5 \
\
MOVQ 16+M0, DX \
MULXQ M1+40(SB), T7, T8 \
XORQ AX, AX \
ADCXQ T7, T2 \
MOVQ T2, 16+C \ // C2_final
ADCXQ T8, T4 \
MULXQ M1+48(SB), T7, T8 \
ADOXQ T7, T4 \
ADCXQ T8, T0 \
MULXQ M1+56(SB), T8, T2 \
ADOXQ T8, T0 \
ADCXQ T2, T1 \
MULXQ M1+64(SB), T7, T2 \
ADCXQ T2, T3 \
MULXQ M1+72(SB), T8, T2 \
ADCXQ T2, T6 \
MULXQ M1+80(SB), T9, T2 \
ADCXQ T2, T5 \
MULXQ M1+88(SB), DX, T2 \
ADCXQ AX, T2 \
\
ADOXQ T7, T1 \
ADOXQ T8, T3 \
ADOXQ T9, T6 \
ADOXQ DX, T5 \
ADOXQ AX, T2 \
\
MOVQ 24+M0, DX \
MULXQ M1+40(SB), T7, T8 \
XORQ AX, AX \
ADCXQ T4, T7 \
ADCXQ T8, T0 \
MULXQ M1+48(SB), T10, T8 \
ADOXQ T10, T0 \
ADCXQ T8, T1 \
MULXQ M1+56(SB), T8, T4 \
ADOXQ T8, T1 \
ADCXQ T4, T3 \
MULXQ M1+64(SB), T10, T4 \
ADCXQ T4, T6 \
MULXQ M1+72(SB), T8, T4 \
ADCXQ T4, T5 \
MULXQ M1+80(SB), T9, T4 \
ADCXQ T4, T2 \
MULXQ M1+88(SB), DX, T4 \
ADCXQ AX, T4 \
\
ADOXQ T10, T3 \
ADOXQ T8, T6 \
ADOXQ T9, T5 \
ADOXQ DX, T2 \
ADOXQ AX, T4
// This multiplies a 256-bit number pointed to by M0 with p751+1.
// It is assumed that M1 points to p751+1 stored as a 768-bit Fp751Element.
// C points to the place to store the result and should be at least 192 bits.
// This should only be used when the BMI2 instruction set extension is
// available.
#define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
MOVQ 0+M0, DX \
MULXQ M1+40(SB), T1, T0 \
MULXQ M1+48(SB), T3, T2 \
MOVQ T1, 0+C \ // C0_final
XORQ AX, AX \
MULXQ M1+56(SB), T5, T4 \
ADDQ T3, T0 \
ADCQ T5, T2 \
MULXQ M1+64(SB), T3, T1 \
ADCQ T3, T4 \
MULXQ M1+72(SB), T6, T5 \
ADCQ T6, T1 \
MULXQ M1+80(SB), T7, T3 \
ADCQ T7, T5 \
MULXQ M1+88(SB), T8, T6 \
ADCQ T8, T3 \
ADCQ AX, T6 \
\
MOVQ 8+M0, DX \
MULXQ M1+40(SB), T7, T8 \
ADDQ T7, T0 \
MOVQ T0, 8+C \ // C1_final
ADCQ T8, T2 \
MULXQ M1+48(SB), T8, T7 \
MOVQ T8, 32+C \
ADCQ T7, T4 \
MULXQ M1+56(SB), T8, T0 \
MOVQ T8, 40+C \
ADCQ T1, T0 \
MULXQ M1+64(SB), T7, T1 \
ADCQ T5, T1 \
MULXQ M1+72(SB), T8, T5 \
ADCQ T5, T3 \
MULXQ M1+80(SB), T9, T5 \
ADCQ T5, T6 \
MULXQ M1+88(SB), DX, T5 \
ADCQ AX, T5 \
\
XORQ AX, AX \
ADDQ 32+C, T2 \
ADCQ 40+C, T4 \
ADCQ T7, T0 \
ADCQ T8, T1 \
ADCQ T9, T3 \
ADCQ DX, T6 \
ADCQ AX, T5 \
\
MOVQ 16+M0, DX \
MULXQ M1+40(SB), T7, T8 \
ADDQ T7, T2 \
MOVQ T2, 16+C \ // C2_final
ADCQ T8, T4 \
MULXQ M1+48(SB), T7, T8 \
MOVQ T7, 32+C \
ADCQ T8, T0 \
MULXQ M1+56(SB), T8, T2 \
MOVQ T8, 40+C \
ADCQ T2, T1 \
MULXQ M1+64(SB), T7, T2 \
ADCQ T2, T3 \
MULXQ M1+72(SB), T8, T2 \
ADCQ T2, T6 \
MULXQ M1+80(SB), T9, T2 \
ADCQ T2, T5 \
MULXQ M1+88(SB), DX, T2 \
ADCQ AX, T2 \
\
XORQ AX, AX \
ADDQ 32+C, T4 \
ADCQ 40+C, T0 \
ADCQ T7, T1 \
ADCQ T8, T3 \
ADCQ T9, T6 \
ADCQ DX, T5 \
ADCQ AX, T2 \
\
MOVQ 24+M0, DX \
MULXQ M1+40(SB), T7, T8 \
ADDQ T4, T7 \
ADCQ T8, T0 \
MULXQ M1+48(SB), T10, T8 \
MOVQ T10, 32+C \
ADCQ T8, T1 \
MULXQ M1+56(SB), T8, T4 \
MOVQ T8, 40+C \
ADCQ T4, T3 \
MULXQ M1+64(SB), T10, T4 \
ADCQ T4, T6 \
MULXQ M1+72(SB), T8, T4 \
ADCQ T4, T5 \
MULXQ M1+80(SB), T9, T4 \
ADCQ T4, T2 \
MULXQ M1+88(SB), DX, T4 \
ADCQ AX, T4 \
\
XORQ AX, AX \
ADDQ 32+C, T0 \
ADCQ 40+C, T1 \
ADCQ T10, T3 \
ADCQ T8, T6 \
ADCQ T9, T5 \
ADCQ DX, T2 \
ADCQ AX, T4
// Template for calculating the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be
// customized with schoolbook multiplicaton for 256 x 448-bit number.
// This macro reuses memory of IN value and *changes* it. Smashes registers
// R[8-15], AX, BX, CX, DX, BP.
// Input:
// * M0: 1536-bit number to be reduced
// * C : either mul256x448bmi2 or mul256x448bmi2adx
// Output: OUT 768-bit
#define REDC(C, M0, MULS) \
\ // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
MULS(M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
XORQ R15, R15 \
MOVQ 48+C, AX \
MOVQ 56+C, DX \
MOVQ 64+C, BX \
ADDQ 40+M0, AX \
ADCQ 48+M0, DX \
ADCQ 56+M0, BX \
MOVQ AX, 40+M0 \
MOVQ DX, 48+M0 \
MOVQ BX, 56+M0 \
ADCQ 64+M0, BP \
ADCQ 72+M0, R8 \
ADCQ 80+M0, R9 \
ADCQ 88+M0, R10 \
ADCQ 96+M0, R11 \
ADCQ 104+M0, R12 \
ADCQ 112+M0, R13 \
ADCQ 120+M0, R14 \
ADCQ 128+M0, R15 \
MOVQ BP, 64+M0 \
MOVQ R8, 72+M0 \
MOVQ R9, 80+M0 \
MOVQ R10, 88+M0 \
MOVQ R11, 96+M0 \
MOVQ R12, 104+M0 \
MOVQ R13, 112+M0 \
MOVQ R14, 120+M0 \
MOVQ R15, 128+M0 \
MOVQ 136+M0, R8 \
MOVQ 144+M0, R9 \
MOVQ 152+M0, R10 \
MOVQ 160+M0, R11 \
MOVQ 168+M0, R12 \
MOVQ 176+M0, R13 \
MOVQ 184+M0, R14 \
ADCQ $0, R8 \
ADCQ $0, R9 \
ADCQ $0, R10 \
ADCQ $0, R11 \
ADCQ $0, R12 \
ADCQ $0, R13 \
ADCQ $0, R14 \
MOVQ R8, 136+M0 \
MOVQ R9, 144+M0 \
MOVQ R10, 152+M0 \
MOVQ R11, 160+M0 \
MOVQ R12, 168+M0 \
MOVQ R13, 176+M0 \
MOVQ R14, 184+M0 \
\ // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
MULS(32+M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
XORQ R15, R15 \
MOVQ 48+C, AX \
MOVQ 56+C, DX \
MOVQ 64+C, BX \
ADDQ 72+M0, AX \
ADCQ 80+M0, DX \
ADCQ 88+M0, BX \
MOVQ AX, 72+M0 \
MOVQ DX, 80+M0 \
MOVQ BX, 88+M0 \
ADCQ 96+M0, BP \
ADCQ 104+M0, R8 \
ADCQ 112+M0, R9 \
ADCQ 120+M0, R10 \
ADCQ 128+M0, R11 \
ADCQ 136+M0, R12 \
ADCQ 144+M0, R13 \
ADCQ 152+M0, R14 \
ADCQ 160+M0, R15 \
MOVQ BP, 0+C \ // Final result c0
MOVQ R8, 104+M0 \
MOVQ R9, 112+M0 \
MOVQ R10, 120+M0 \
MOVQ R11, 128+M0 \
MOVQ R12, 136+M0 \
MOVQ R13, 144+M0 \
MOVQ R14, 152+M0 \
MOVQ R15, 160+M0 \
MOVQ 168+M0, R12 \
MOVQ 176+M0, R13 \
MOVQ 184+M0, R14 \
ADCQ $0, R12 \
ADCQ $0, R13 \
ADCQ $0, R14 \
MOVQ R12, 168+M0 \
MOVQ R13, 176+M0 \
MOVQ R14, 184+M0 \
\ // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
MULS(64+M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
MOVQ 48+C, AX \ // Final result c1:c11
MOVQ 56+C, DX \
MOVQ 64+C, BX \
ADDQ 104+M0, AX \
ADCQ 112+M0, DX \
ADCQ 120+M0, BX \
MOVQ AX, 8+C \
MOVQ DX, 16+C \
MOVQ BX, 24+C \
ADCQ 128+M0, BP \
ADCQ 136+M0, R8 \
ADCQ 144+M0, R9 \
ADCQ 152+M0, R10 \
ADCQ 160+M0, R11 \
ADCQ 168+M0, R12 \
ADCQ 176+M0, R13 \
ADCQ 184+M0, R14 \
MOVQ BP, 32+C \
MOVQ R8, 40+C \
MOVQ R9, 48+C \
MOVQ R10, 56+C \
MOVQ R11, 64+C \
MOVQ R12, 72+C \
MOVQ R13, 80+C \
MOVQ R14, 88+C
2018-07-23 23:18:38 +01:00
TEXT ·fp751MontgomeryReduce(SB), $0-16
2018-07-23 23:18:38 +01:00
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1
// Check wether to use optimized implementation
CMPB ·HasADXandBMI2(SB), $1
JE redc_with_mulx_adcx_adox
CMPB ·HasBMI2(SB), $1
JE redc_with_mulx
2018-07-23 23:18:38 +01:00
MOVQ (REG_P1), R11
MOVQ P751P1_5, AX
MULQ R11
XORQ R8, R8
ADDQ (40)(REG_P1), AX
MOVQ AX, (40)(REG_P2) // Z5
ADCQ DX, R8
XORQ R9, R9
MOVQ P751P1_6, AX
MULQ R11
XORQ R10, R10
ADDQ AX, R8
ADCQ DX, R9
MOVQ (8)(REG_P1), R12
MOVQ P751P1_5, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (48)(REG_P1), R8
MOVQ R8, (48)(REG_P2) // Z6
ADCQ $0, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ P751P1_7, AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_6, AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (16)(REG_P1), R13
MOVQ P751P1_5, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (56)(REG_P1), R9
MOVQ R9, (56)(REG_P2) // Z7
ADCQ $0, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ P751P1_8, AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_7, AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_6, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (24)(REG_P1), R14
MOVQ P751P1_5, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (64)(REG_P1), R10
MOVQ R10, (64)(REG_P2) // Z8
ADCQ $0, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ P751P1_9, AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_8, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_7, AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_6, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (32)(REG_P1), R15
MOVQ P751P1_5, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (72)(REG_P1), R8
MOVQ R8, (72)(REG_P2) // Z9
ADCQ $0, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ P751P1_10, AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_9, AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_8, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_7, AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_6, AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (40)(REG_P2), CX
MOVQ P751P1_5, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (80)(REG_P1), R9
MOVQ R9, (80)(REG_P2) // Z10
ADCQ $0, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ P751P1_11, AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_10, AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_9, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_8, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_7, AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_6, AX
MULQ CX
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (48)(REG_P2), R11
MOVQ P751P1_5, AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (88)(REG_P1), R10
MOVQ R10, (88)(REG_P2) // Z11
ADCQ $0, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ P751P1_11, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_10, AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_9, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_8, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_7, AX
MULQ CX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_6, AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (56)(REG_P2), R12
MOVQ P751P1_5, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (96)(REG_P1), R8
MOVQ R8, (REG_P2) // Z0
ADCQ $0, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ P751P1_11, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_10, AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_9, AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_8, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_7, AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_6, AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (64)(REG_P2), R13
MOVQ P751P1_5, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (104)(REG_P1), R9
MOVQ R9, (8)(REG_P2) // Z1
ADCQ $0, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ P751P1_11, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_10, AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_9, AX
MULQ CX
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_8, AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_7, AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_6, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (72)(REG_P2), R14
MOVQ P751P1_5, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (112)(REG_P1), R10
MOVQ R10, (16)(REG_P2) // Z2
ADCQ $0, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ P751P1_11, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_10, AX
MULQ CX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_9, AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_8, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_7, AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_6, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (80)(REG_P2), R15
MOVQ P751P1_5, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (120)(REG_P1), R8
MOVQ R8, (24)(REG_P2) // Z3
ADCQ $0, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ P751P1_11, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_10, AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_9, AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_8, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_7, AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_6, AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (88)(REG_P2), CX
MOVQ P751P1_5, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (128)(REG_P1), R9
MOVQ R9, (32)(REG_P2) // Z4
ADCQ $0, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ P751P1_11, AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_10, AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_9, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_8, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_7, AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_6, AX
MULQ CX
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (136)(REG_P1), R10
MOVQ R10, (40)(REG_P2) // Z5
ADCQ $0, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ P751P1_11, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_10, AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_9, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_8, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_7, AX
MULQ CX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (144)(REG_P1), R8
MOVQ R8, (48)(REG_P2) // Z6
ADCQ $0, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ P751P1_11, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_10, AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_9, AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_8, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (152)(REG_P1), R9
MOVQ R9, (56)(REG_P2) // Z7
ADCQ $0, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ P751P1_11, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_10, AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_9, AX
MULQ CX
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (160)(REG_P1), R10
MOVQ R10, (64)(REG_P2) // Z8
ADCQ $0, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ P751P1_11, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_10, AX
MULQ CX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (168)(REG_P1), R8 // Z9
MOVQ R8, (72)(REG_P2) // Z9
ADCQ $0, R9
ADCQ $0, R10
MOVQ P751P1_11, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADDQ (176)(REG_P1), R9 // Z10
MOVQ R9, (80)(REG_P2) // Z10
ADCQ $0, R10
ADDQ (184)(REG_P1), R10 // Z11
MOVQ R10, (88)(REG_P2) // Z11
RET
redc_with_mulx_adcx_adox:
// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 and ADX instruction set extensions are available.
REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2adx)
RET
2018-07-23 23:18:38 +01:00
redc_with_mulx:
// This implements the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
// This assumes that the BMI2 instruction set extension is available.
REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2)
2018-07-23 23:18:38 +01:00
RET
TEXT ·fp751AddLazy(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
MOVQ (64)(REG_P1), AX
MOVQ (72)(REG_P1), BX
MOVQ (80)(REG_P1), CX
MOVQ (88)(REG_P1), DI
ADDQ (REG_P2), R8
ADCQ (8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
ADCQ (48)(REG_P2), R14
ADCQ (56)(REG_P2), R15
ADCQ (64)(REG_P2), AX
ADCQ (72)(REG_P2), BX
ADCQ (80)(REG_P2), CX
ADCQ (88)(REG_P2), DI
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
MOVQ AX, (64)(REG_P3)
MOVQ BX, (72)(REG_P3)
MOVQ CX, (80)(REG_P3)
MOVQ DI, (88)(REG_P3)
RET
TEXT ·fp751X2AddLazy(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
MOVQ (64)(REG_P1), AX
MOVQ (72)(REG_P1), BX
MOVQ (80)(REG_P1), CX
ADDQ (REG_P2), R8
ADCQ (8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
ADCQ (48)(REG_P2), R14
ADCQ (56)(REG_P2), R15
ADCQ (64)(REG_P2), AX
ADCQ (72)(REG_P2), BX
ADCQ (80)(REG_P2), CX
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
MOVQ AX, (64)(REG_P3)
MOVQ BX, (72)(REG_P3)
MOVQ CX, (80)(REG_P3)
MOVQ (88)(REG_P1), AX
ADCQ (88)(REG_P2), AX
MOVQ AX, (88)(REG_P3)
MOVQ (96)(REG_P1), R8
MOVQ (104)(REG_P1), R9
MOVQ (112)(REG_P1), R10
MOVQ (120)(REG_P1), R11
MOVQ (128)(REG_P1), R12
MOVQ (136)(REG_P1), R13
MOVQ (144)(REG_P1), R14
MOVQ (152)(REG_P1), R15
MOVQ (160)(REG_P1), AX
MOVQ (168)(REG_P1), BX
MOVQ (176)(REG_P1), CX
MOVQ (184)(REG_P1), DI
ADCQ (96)(REG_P2), R8
ADCQ (104)(REG_P2), R9
ADCQ (112)(REG_P2), R10
ADCQ (120)(REG_P2), R11
ADCQ (128)(REG_P2), R12
ADCQ (136)(REG_P2), R13
ADCQ (144)(REG_P2), R14
ADCQ (152)(REG_P2), R15
ADCQ (160)(REG_P2), AX
ADCQ (168)(REG_P2), BX
ADCQ (176)(REG_P2), CX
ADCQ (184)(REG_P2), DI
MOVQ R8, (96)(REG_P3)
MOVQ R9, (104)(REG_P3)
MOVQ R10, (112)(REG_P3)
MOVQ R11, (120)(REG_P3)
MOVQ R12, (128)(REG_P3)
MOVQ R13, (136)(REG_P3)
MOVQ R14, (144)(REG_P3)
MOVQ R15, (152)(REG_P3)
MOVQ AX, (160)(REG_P3)
MOVQ BX, (168)(REG_P3)
MOVQ CX, (176)(REG_P3)
MOVQ DI, (184)(REG_P3)
RET
TEXT ·fp751X2SubLazy(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
MOVQ (64)(REG_P1), AX
MOVQ (72)(REG_P1), BX
MOVQ (80)(REG_P1), CX
SUBQ (REG_P2), R8
SBBQ (8)(REG_P2), R9
SBBQ (16)(REG_P2), R10
SBBQ (24)(REG_P2), R11
SBBQ (32)(REG_P2), R12
SBBQ (40)(REG_P2), R13
SBBQ (48)(REG_P2), R14
SBBQ (56)(REG_P2), R15
SBBQ (64)(REG_P2), AX
SBBQ (72)(REG_P2), BX
SBBQ (80)(REG_P2), CX
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
MOVQ AX, (64)(REG_P3)
MOVQ BX, (72)(REG_P3)
MOVQ CX, (80)(REG_P3)
MOVQ (88)(REG_P1), AX
SBBQ (88)(REG_P2), AX
MOVQ AX, (88)(REG_P3)
MOVQ (96)(REG_P1), R8
MOVQ (104)(REG_P1), R9
MOVQ (112)(REG_P1), R10
MOVQ (120)(REG_P1), R11
MOVQ (128)(REG_P1), R12
MOVQ (136)(REG_P1), R13
MOVQ (144)(REG_P1), R14
MOVQ (152)(REG_P1), R15
MOVQ (160)(REG_P1), AX
MOVQ (168)(REG_P1), BX
MOVQ (176)(REG_P1), CX
MOVQ (184)(REG_P1), DI
SBBQ (96)(REG_P2), R8
SBBQ (104)(REG_P2), R9
SBBQ (112)(REG_P2), R10
SBBQ (120)(REG_P2), R11
SBBQ (128)(REG_P2), R12
SBBQ (136)(REG_P2), R13
SBBQ (144)(REG_P2), R14
SBBQ (152)(REG_P2), R15
SBBQ (160)(REG_P2), AX
SBBQ (168)(REG_P2), BX
SBBQ (176)(REG_P2), CX
SBBQ (184)(REG_P2), DI
MOVQ R8, (96)(REG_P3)
MOVQ R9, (104)(REG_P3)
MOVQ R10, (112)(REG_P3)
MOVQ R11, (120)(REG_P3)
MOVQ R12, (128)(REG_P3)
MOVQ R13, (136)(REG_P3)
MOVQ R14, (144)(REG_P3)
MOVQ R15, (152)(REG_P3)
MOVQ AX, (160)(REG_P3)
MOVQ BX, (168)(REG_P3)
MOVQ CX, (176)(REG_P3)
MOVQ DI, (184)(REG_P3)
// Now the carry flag is 1 if x-y < 0. If so, add p*2^768.
MOVQ $0, AX
2018-07-23 23:18:38 +01:00
SBBQ $0, AX
// Load p into registers:
MOVQ P751_0, R8
// P751_{1,2,3,4} = P751_0, so reuse R8
MOVQ P751_5, R9
MOVQ P751_6, R10
MOVQ P751_7, R11
MOVQ P751_8, R12
MOVQ P751_9, R13
MOVQ P751_10, R14
MOVQ P751_11, R15
ANDQ AX, R8
ANDQ AX, R9
ANDQ AX, R10
ANDQ AX, R11
ANDQ AX, R12
ANDQ AX, R13
ANDQ AX, R14
ANDQ AX, R15
ADDQ R8, (96 )(REG_P3)
ADCQ R8, (96+ 8)(REG_P3)
ADCQ R8, (96+16)(REG_P3)
ADCQ R8, (96+24)(REG_P3)
ADCQ R8, (96+32)(REG_P3)
ADCQ R9, (96+40)(REG_P3)
ADCQ R10, (96+48)(REG_P3)
ADCQ R11, (96+56)(REG_P3)
ADCQ R12, (96+64)(REG_P3)
ADCQ R13, (96+72)(REG_P3)
ADCQ R14, (96+80)(REG_P3)
ADCQ R15, (96+88)(REG_P3)
RET