1
0
mirror of https://github.com/henrydcase/nobs.git synced 2024-11-26 09:01:20 +00:00
nobs/dh/sidh/p751/arith_amd64.s
Kris Kwiatkowski e9ddb6fb45
sidh/csidh: use SSE for performing CSWAP (#6)
* Makefile

* makefile: tools for profiling

* sidh: use SIMD for performing CSWAP

Loads data into 128-bit XMM registers and performs conditional swap.
This is probably less useful for SIDH, but will be useful for cSIDH
2018-10-29 15:41:09 +00:00

2203 lines
35 KiB
ArmAsm

// +build amd64,!noasm
#include "textflag.h"
// Limbs of p751 + 1, 64 bits each, limb 0 being the least significant.
// Only limbs 5..11 are defined: limbs 0..4 of p751 are all-ones, so adding 1
// carries through them and leaves limbs 0..4 of p751 + 1 equal to zero.
#define P751P1_5 $0xEEB0000000000000
#define P751P1_6 $0xE3EC968549F878A8
#define P751P1_7 $0xDA959B1A13F7CC76
#define P751P1_8 $0x084E9867D6EBE876
#define P751P1_9 $0x8562B5045CB25748
#define P751P1_10 $0x0E12909F97BADC66
#define P751P1_11 $0x00006FE5D541F71C
// Limbs of p751. Limbs 1..4 are equal to limb 0 (all-ones), so the code
// below reuses P751_0 for them and no separate names are defined.
#define P751_0 $0xFFFFFFFFFFFFFFFF
#define P751_5 $0xEEAFFFFFFFFFFFFF
#define P751_6 $0xE3EC968549F878A8
#define P751_7 $0xDA959B1A13F7CC76
#define P751_8 $0x084E9867D6EBE876
#define P751_9 $0x8562B5045CB25748
#define P751_10 $0x0E12909F97BADC66
#define P751_11 $0x00006FE5D541F71C
// Limbs of 2 * p751. Limbs 2..4 are equal to limb 1 (all-ones), so the code
// below reuses P751X2_1 for them.
#define P751X2_0 $0xFFFFFFFFFFFFFFFE
#define P751X2_1 $0xFFFFFFFFFFFFFFFF
#define P751X2_5 $0xDD5FFFFFFFFFFFFF
#define P751X2_6 $0xC7D92D0A93F0F151
#define P751X2_7 $0xB52B363427EF98ED
#define P751X2_8 $0x109D30CFADD7D0ED
#define P751X2_9 $0x0AC56A08B964AE90
#define P751X2_10 $0x1C25213F2F75B8CD
#define P751X2_11 $0x0000DFCBAA83EE38
// The MSR code uses these registers for parameter passing. Keep using
// them to avoid significant code changes. This means that when the Go
// assembler does something strange, we can diff the machine code
// against a different assembler to find out what Go did.
//
// Note that these are only aliases: every function below loads them
// explicitly from its Go stack arguments, and several functions reuse
// DI and SI as scratch once the corresponding pointer is no longer needed.
#define REG_P1 DI
#define REG_P2 SI
#define REG_P3 DX
// We can't write MOVQ $0, AX because Go's assembler incorrectly
// optimizes this to XOR AX, AX, which clobbers the carry flags.
//
// This bug was defined to be "correct" behaviour (cf.
// https://github.com/golang/go/issues/12405 ) by declaring that the MOV
// pseudo-instruction clobbers flags, although this fact is mentioned
// nowhere in the documentation for the Go assembler.
//
// Defining MOVQ to clobber flags has the effect that it is never safe
// to interleave MOVQ with ADCQ and SBBQ instructions. Since this is
// required to write a carry chain longer than registers' working set,
// all of the below code therefore relies on the unspecified and
// undocumented behaviour that MOV won't clobber flags, except in the
// case of the above-mentioned bug.
//
// However, there's also no specification of which instructions
// correspond to machine instructions, and which are
// pseudo-instructions (i.e., no specification of what the assembler
// actually does), so this doesn't seem much worse than usual.
//
// Avoid the bug by dropping the bytes for `mov eax, 0` in directly:
// 0xB8 is the opcode for `mov eax, imm32` and the four zero bytes are the
// immediate. Writing the 32-bit register also clears the upper half of RAX,
// so the whole of AX ends up zero while the carry flag is left untouched.
#define ZERO_AX_WITHOUT_CLOBBERING_FLAGS BYTE $0xB8; BYTE $0; BYTE $0; BYTE $0; BYTE $0;
// fp751StrongReduce(x): in-place conditional subtraction of p751.
//
// The 12-limb (96-byte) value at x is replaced by x - p; if that subtraction
// borrows, p is added back, so x is left unchanged in that case. The same
// instruction sequence runs either way: the choice is applied through an
// all-zeros / all-ones mask rather than a branch.
TEXT ·fp751StrongReduce(SB), NOSPLIT, $0-8
	MOVQ	x+0(FP), REG_P1

	// Limbs of p. Limbs 0..4 of p are all equal to P751_0, so R8 stands in
	// for all five of them.
	MOVQ	P751_0, R8
	MOVQ	P751_5, R9
	MOVQ	P751_6, R10
	MOVQ	P751_7, R11
	MOVQ	P751_8, R12
	MOVQ	P751_9, R13
	MOVQ	P751_10, R14
	MOVQ	P751_11, R15

	// Clear AX now, before the borrow chain starts: XORQ rewrites the flags,
	// so it cannot be done between the chain and the SBBQ that reads CF.
	XORQ	AX, AX

	// x <- x - p. CF after the last SBBQ is the borrow out of the top limb.
	SUBQ	R8, (REG_P1)
	SBBQ	R8, (8)(REG_P1)
	SBBQ	R8, (16)(REG_P1)
	SBBQ	R8, (24)(REG_P1)
	SBBQ	R8, (32)(REG_P1)
	SBBQ	R9, (40)(REG_P1)
	SBBQ	R10, (48)(REG_P1)
	SBBQ	R11, (56)(REG_P1)
	SBBQ	R12, (64)(REG_P1)
	SBBQ	R13, (72)(REG_P1)
	SBBQ	R14, (80)(REG_P1)
	SBBQ	R15, (88)(REG_P1)

	// AX <- 0 - CF: all-ones when x - p went negative, zero otherwise.
	SBBQ	$0, AX

	// Keep the limbs of p only when the mask is set ...
	ANDQ	AX, R8
	ANDQ	AX, R9
	ANDQ	AX, R10
	ANDQ	AX, R11
	ANDQ	AX, R12
	ANDQ	AX, R13
	ANDQ	AX, R14
	ANDQ	AX, R15

	// ... and add them back: x <- x + (p & mask).
	ADDQ	R8, (REG_P1)
	ADCQ	R8, (8)(REG_P1)
	ADCQ	R8, (16)(REG_P1)
	ADCQ	R8, (24)(REG_P1)
	ADCQ	R8, (32)(REG_P1)
	ADCQ	R9, (40)(REG_P1)
	ADCQ	R10, (48)(REG_P1)
	ADCQ	R11, (56)(REG_P1)
	ADCQ	R12, (64)(REG_P1)
	ADCQ	R13, (72)(REG_P1)
	ADCQ	R14, (80)(REG_P1)
	ADCQ	R15, (88)(REG_P1)
	RET
// fp751ConditionalSwap(x, y, choice): swaps the 96-byte values at x and y
// when choice is 1 and leaves both unchanged when choice is 0. The same
// loads, XORs and stores are executed in both cases.
TEXT ·fp751ConditionalSwap(SB), NOSPLIT, $0-17
	MOVQ	x+0(FP), REG_P1
	MOVQ	y+8(FP), REG_P2

	// Zero-extend the choice byte into AX and negate it:
	// 0 stays 0, 1 becomes 0xFFFFFFFFFFFFFFFF.
	MOVBLZX	choice+16(FP), AX
	NEGQ	AX

	// Put AX into the low 64 bits of X15, then replicate the low 32-bit
	// lane into all four lanes. Because AX is all-zeros or all-ones, X15
	// ends up all-zeros or all-ones across its full 128 bits.
	MOVQ	AX, X15
	PSHUFD	$0, X15, X15

// CSWAP_BLOCK(idx) handles the idx-th 16-byte chunk of both operands:
//   t = (x ^ y) & mask;  x ^= t;  y ^= t
// X3 and X4 hold the two chunks, X5 holds t, X15 holds the mask.
#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
	MOVOU	(idx*16)(REG_P1), X3 \
	MOVOU	(idx*16)(REG_P2), X4 \
	MOVO	X4, X5 \
	PXOR	X3, X5 \
	PAND	X15, X5 \
	PXOR	X5, X3 \
	PXOR	X5, X4 \
	MOVOU	X3, (idx*16)(REG_P1) \
	MOVOU	X4, (idx*16)(REG_P2)
#endif

	// Six 16-byte chunks cover the 96 bytes of each operand.
	CSWAP_BLOCK(0)
	CSWAP_BLOCK(1)
	CSWAP_BLOCK(2)
	CSWAP_BLOCK(3)
	CSWAP_BLOCK(4)
	CSWAP_BLOCK(5)

#ifdef CSWAP_BLOCK
#undef CSWAP_BLOCK
#endif
	RET
// fp751AddReduced(z, x, y): z = x + y - 2p, with 2p added back when the
// subtraction borrows. x, y and z are each accessed as 12 limbs (96 bytes).
//
// The carry/borrow flag is kept live across MOVQ instructions throughout, so
// the order of instructions here matters; see the note above
// ZERO_AX_WITHOUT_CLOBBERING_FLAGS.
TEXT ·fp751AddReduced(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// Limbs 0..8 of x are held in R8-R15 and CX.
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
MOVQ (64)(REG_P1), CX
// x + y, limbs 0..8 in registers.
ADDQ (REG_P2), R8
ADCQ (8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
ADCQ (48)(REG_P2), R14
ADCQ (56)(REG_P2), R15
ADCQ (64)(REG_P2), CX
// There are not enough registers for all 12 limbs: limbs 9..11 of the sum
// go through AX and are parked in z[9..11]. The carry stays live across
// the MOVQs.
MOVQ (72)(REG_P1), AX
ADCQ (72)(REG_P2), AX
MOVQ AX, (72)(REG_P3)
MOVQ (80)(REG_P1), AX
ADCQ (80)(REG_P2), AX
MOVQ AX, (80)(REG_P3)
MOVQ (88)(REG_P1), AX
ADCQ (88)(REG_P2), AX
MOVQ AX, (88)(REG_P3)
// Subtract 2p from limbs 0..8. P751X2_1 is used for limbs 1..4.
MOVQ P751X2_0, AX
SUBQ AX, R8
MOVQ P751X2_1, AX
SBBQ AX, R9
SBBQ AX, R10
SBBQ AX, R11
SBBQ AX, R12
MOVQ P751X2_5, AX
SBBQ AX, R13
MOVQ P751X2_6, AX
SBBQ AX, R14
MOVQ P751X2_7, AX
SBBQ AX, R15
MOVQ P751X2_8, AX
SBBQ AX, CX
// Store limbs 0..8 to free R8-R10, then continue the borrow chain on the
// parked limbs 9..11.
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
MOVQ CX, (64)(REG_P3)
MOVQ (72)(REG_P3), R8
MOVQ (80)(REG_P3), R9
MOVQ (88)(REG_P3), R10
MOVQ P751X2_9, AX
SBBQ AX, R8
MOVQ P751X2_10, AX
SBBQ AX, R9
MOVQ P751X2_11, AX
SBBQ AX, R10
MOVQ R8, (72)(REG_P3)
MOVQ R9, (80)(REG_P3)
MOVQ R10, (88)(REG_P3)
// AX <- 0 - borrow: all-ones if x + y - 2p went negative, zero otherwise.
ZERO_AX_WITHOUT_CLOBBERING_FLAGS
SBBQ $0, AX
// Build (2p & mask). SI (REG_P2) is reused as scratch here; y has been
// fully consumed by this point.
MOVQ P751X2_0, SI
ANDQ AX, SI
MOVQ P751X2_1, R8
ANDQ AX, R8
MOVQ P751X2_5, R9
ANDQ AX, R9
MOVQ P751X2_6, R10
ANDQ AX, R10
MOVQ P751X2_7, R11
ANDQ AX, R11
MOVQ P751X2_8, R12
ANDQ AX, R12
MOVQ P751X2_9, R13
ANDQ AX, R13
MOVQ P751X2_10, R14
ANDQ AX, R14
MOVQ P751X2_11, R15
ANDQ AX, R15
// z <- z + (2p & mask), one limb at a time through AX. R8 holds the masked
// all-ones limb and is used for limbs 1..4.
MOVQ (REG_P3), AX
ADDQ SI, AX
MOVQ AX, (REG_P3)
MOVQ (8)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (8)(REG_P3)
MOVQ (16)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (16)(REG_P3)
MOVQ (24)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (24)(REG_P3)
MOVQ (32)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (32)(REG_P3)
MOVQ (40)(REG_P3), AX
ADCQ R9, AX
MOVQ AX, (40)(REG_P3)
MOVQ (48)(REG_P3), AX
ADCQ R10, AX
MOVQ AX, (48)(REG_P3)
MOVQ (56)(REG_P3), AX
ADCQ R11, AX
MOVQ AX, (56)(REG_P3)
MOVQ (64)(REG_P3), AX
ADCQ R12, AX
MOVQ AX, (64)(REG_P3)
MOVQ (72)(REG_P3), AX
ADCQ R13, AX
MOVQ AX, (72)(REG_P3)
MOVQ (80)(REG_P3), AX
ADCQ R14, AX
MOVQ AX, (80)(REG_P3)
MOVQ (88)(REG_P3), AX
ADCQ R15, AX
MOVQ AX, (88)(REG_P3)
RET
// fp751SubReduced(z, x, y): z = x - y, with 2p added when the subtraction
// borrows. x, y and z are each accessed as 12 limbs (96 bytes).
//
// The borrow flag is kept live across MOVQ instructions, so the order of
// instructions here matters; see the note above
// ZERO_AX_WITHOUT_CLOBBERING_FLAGS.
TEXT ·fp751SubReduced(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// Limbs 0..8 of x are held in R8-R15 and CX.
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
MOVQ (64)(REG_P1), CX
// x - y, limbs 0..8.
SUBQ (REG_P2), R8
SBBQ (8)(REG_P2), R9
SBBQ (16)(REG_P2), R10
SBBQ (24)(REG_P2), R11
SBBQ (32)(REG_P2), R12
SBBQ (40)(REG_P2), R13
SBBQ (48)(REG_P2), R14
SBBQ (56)(REG_P2), R15
SBBQ (64)(REG_P2), CX
// Store limbs 0..8; the borrow stays live across these MOVQs.
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
MOVQ CX, (64)(REG_P3)
// Limbs 9..11 go through AX one at a time.
MOVQ (72)(REG_P1), AX
SBBQ (72)(REG_P2), AX
MOVQ AX, (72)(REG_P3)
MOVQ (80)(REG_P1), AX
SBBQ (80)(REG_P2), AX
MOVQ AX, (80)(REG_P3)
MOVQ (88)(REG_P1), AX
SBBQ (88)(REG_P2), AX
MOVQ AX, (88)(REG_P3)
// AX <- 0 - borrow: all-ones if x - y went negative, zero otherwise.
ZERO_AX_WITHOUT_CLOBBERING_FLAGS
SBBQ $0, AX
// Build (2p & mask). SI (REG_P2) is reused as scratch here; y has been
// fully consumed by this point.
MOVQ P751X2_0, SI
ANDQ AX, SI
MOVQ P751X2_1, R8
ANDQ AX, R8
MOVQ P751X2_5, R9
ANDQ AX, R9
MOVQ P751X2_6, R10
ANDQ AX, R10
MOVQ P751X2_7, R11
ANDQ AX, R11
MOVQ P751X2_8, R12
ANDQ AX, R12
MOVQ P751X2_9, R13
ANDQ AX, R13
MOVQ P751X2_10, R14
ANDQ AX, R14
MOVQ P751X2_11, R15
ANDQ AX, R15
// z <- z + (2p & mask), one limb at a time through AX. R8 holds the masked
// all-ones limb and is used for limbs 1..4.
MOVQ (REG_P3), AX
ADDQ SI, AX
MOVQ AX, (REG_P3)
MOVQ (8)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (8)(REG_P3)
MOVQ (16)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (16)(REG_P3)
MOVQ (24)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (24)(REG_P3)
MOVQ (32)(REG_P3), AX
ADCQ R8, AX
MOVQ AX, (32)(REG_P3)
MOVQ (40)(REG_P3), AX
ADCQ R9, AX
MOVQ AX, (40)(REG_P3)
MOVQ (48)(REG_P3), AX
ADCQ R10, AX
MOVQ AX, (48)(REG_P3)
MOVQ (56)(REG_P3), AX
ADCQ R11, AX
MOVQ AX, (56)(REG_P3)
MOVQ (64)(REG_P3), AX
ADCQ R12, AX
MOVQ AX, (64)(REG_P3)
MOVQ (72)(REG_P3), AX
ADCQ R13, AX
MOVQ AX, (72)(REG_P3)
MOVQ (80)(REG_P3), AX
ADCQ R14, AX
MOVQ AX, (80)(REG_P3)
MOVQ (88)(REG_P3), AX
ADCQ R15, AX
MOVQ AX, (88)(REG_P3)
RET
// fp751Mul(z, x, y): z = x * y, with no modular reduction.
// x and y are read as 12 limbs each; z is written as 24 limbs (offsets
// 0..184). 96 bytes of stack are used as scratch.
//
// One level of Karatsuba is used. Writing x = AH*2^384 + AL and
// y = BH*2^384 + BL (6 limbs per half):
//   1. z[0..5]  <- AL + AH,  z[6..11] <- BL + BH   (carries kept as masks)
//   2. SP[0..11] <- (AL+AH) * (BL+BH)
//   3. z[0..11]  <- AL * BL
//   4. z[12..23] <- AH * BH
//   5. middle    <- SP[0..11] - AL*BL - AH*BH
//   6. z[6..17]  += middle, carry propagated through z[18..23]
//
// z is used as scratch before x and y have been fully read, so z presumably
// must not overlap x or y -- NOTE(review): confirm against the Go callers.
TEXT ·fp751Mul(SB), $96-24
// Here we store the destination in CX instead of in REG_P3 because the
// multiplication instructions use DX as an implicit destination
// operand: MULQ $REG sets DX:AX <-- AX * $REG.
MOVQ z+0(FP), CX
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// Step 1a: z[0..5] <- AL + AH. AX is zeroed first so that the SBBQ below
// turns the carry out of the sum into an all-zeros / all-ones mask.
XORQ AX, AX
MOVQ (48)(REG_P1), R8
MOVQ (56)(REG_P1), R9
MOVQ (64)(REG_P1), R10
MOVQ (72)(REG_P1), R11
MOVQ (80)(REG_P1), R12
MOVQ (88)(REG_P1), R13
ADDQ (REG_P1), R8
ADCQ (8)(REG_P1), R9
ADCQ (16)(REG_P1), R10
ADCQ (24)(REG_P1), R11
ADCQ (32)(REG_P1), R12
ADCQ (40)(REG_P1), R13
MOVQ R8, (CX)
MOVQ R9, (8)(CX)
MOVQ R10, (16)(CX)
MOVQ R11, (24)(CX)
MOVQ R12, (32)(CX)
MOVQ R13, (40)(CX)
SBBQ $0, AX
// Step 1b: z[6..11] <- BL + BH, carry mask in DX. The sum limbs also stay
// in R8-R13 and R8-R10 are used directly by the first few products below.
XORQ DX, DX
MOVQ (48)(REG_P2), R8
MOVQ (56)(REG_P2), R9
MOVQ (64)(REG_P2), R10
MOVQ (72)(REG_P2), R11
MOVQ (80)(REG_P2), R12
MOVQ (88)(REG_P2), R13
ADDQ (REG_P2), R8
ADCQ (8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
MOVQ R8, (48)(CX)
MOVQ R9, (56)(CX)
MOVQ R10, (64)(CX)
MOVQ R11, (72)(CX)
MOVQ R12, (80)(CX)
MOVQ R13, (88)(CX)
SBBQ $0, DX
// Park both carry masks on the stack: 80(SP) = mask for AL+AH,
// 88(SP) = mask for BL+BH. MULQ overwrites AX and DX.
MOVQ AX, (80)(SP)
MOVQ DX, (88)(SP)
// (SP[0-8],R10,R8,R9) <- (AH+AL)*(BH+BL)
// Step 2: column-by-column product of the two 6-limb sums (carry bits
// excluded; they are accounted for afterwards). R8/R9/R10 rotate as a
// three-limb accumulator; each finished column is stored to the stack.
MOVQ (CX), R11
MOVQ R8, AX
MULQ R11
MOVQ AX, (SP) // c0
MOVQ DX, R14
XORQ R15, R15
MOVQ R9, AX
MULQ R11
XORQ R9, R9
ADDQ AX, R14
ADCQ DX, R9
MOVQ (8)(CX), R12
MOVQ R8, AX
MULQ R12
ADDQ AX, R14
MOVQ R14, (8)(SP) // c1
ADCQ DX, R9
ADCQ $0, R15
XORQ R8, R8
MOVQ R10, AX
MULQ R11
ADDQ AX, R9
MOVQ (48)(CX), R13
ADCQ DX, R15
ADCQ $0, R8
MOVQ (16)(CX), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R15
MOVQ (56)(CX), AX
ADCQ $0, R8
MULQ R12
ADDQ AX, R9
MOVQ R9, (16)(SP) // c2
ADCQ DX, R15
ADCQ $0, R8
XORQ R9, R9
MOVQ (72)(CX), AX
MULQ R11
ADDQ AX, R15
ADCQ DX, R8
ADCQ $0, R9
MOVQ (24)(CX), AX
MULQ R13
ADDQ AX, R15
ADCQ DX, R8
ADCQ $0, R9
MOVQ R10, AX
MULQ R12
ADDQ AX, R15
ADCQ DX, R8
ADCQ $0, R9
MOVQ (16)(CX), R14
MOVQ (56)(CX), AX
MULQ R14
ADDQ AX, R15
MOVQ R15, (24)(SP) // c3
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (80)(CX), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (64)(CX), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (48)(CX), R15
MOVQ (32)(CX), AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (72)(CX), AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (24)(CX), R13
MOVQ (56)(CX), AX
MULQ R13
ADDQ AX, R8
MOVQ R8, (32)(SP) // c4
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (88)(CX), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (64)(CX), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (72)(CX), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (40)(CX), AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (80)(CX), AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (32)(CX), R15
MOVQ (56)(CX), AX
MULQ R15
ADDQ AX, R9
MOVQ R9, (40)(SP) // c5
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (64)(CX), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (88)(CX), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (80)(CX), AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (40)(CX), R11
MOVQ (56)(CX), AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (72)(CX), AX
MULQ R13
ADDQ AX, R10
MOVQ R10, (48)(SP) // c6
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (88)(CX), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (64)(CX), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (80)(CX), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (72)(CX), AX
MULQ R15
ADDQ AX, R8
MOVQ R8, (56)(SP) // c7
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (72)(CX), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (80)(CX), AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (88)(CX), AX
MULQ R13
ADDQ AX, R9
MOVQ R9, (64)(SP) // c8
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (88)(CX), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (80)(CX), AX
MULQ R11
ADDQ AX, R10 // c9
ADCQ DX, R8
ADCQ $0, R9
MOVQ (88)(CX), AX
MULQ R11
ADDQ AX, R8 // c10
ADCQ DX, R9 // c11
// Carry correction, part 1. At this point DX (loaded below), R12, R14, R13,
// R15, R11 hold limbs 0..5 of AL+AH. Mask them with the BL+BH carry mask
// and add columns c6..c11 of the product on top.
MOVQ (88)(SP), AX
MOVQ (CX), DX
ANDQ AX, R12
ANDQ AX, R14
ANDQ AX, DX
ANDQ AX, R13
ANDQ AX, R15
ANDQ AX, R11
MOVQ (48)(SP), AX
ADDQ AX, DX
MOVQ (56)(SP), AX
ADCQ AX, R12
MOVQ (64)(SP), AX
ADCQ AX, R14
ADCQ R10, R13
ADCQ R8, R15
ADCQ R9, R11
// Fetch the AL+AH carry mask before its stack slot (80(SP)) is overwritten.
MOVQ (80)(SP), AX
MOVQ DX, (48)(SP)
MOVQ R12, (56)(SP)
MOVQ R14, (64)(SP)
MOVQ R13, (72)(SP)
MOVQ R15, (80)(SP)
MOVQ R11, (88)(SP)
// Carry correction, part 2: mask the limbs of BL+BH (z[6..11]) with the
// AL+AH carry mask and add them to SP[6..11] as well.
MOVQ (48)(CX), R8
MOVQ (56)(CX), R9
MOVQ (64)(CX), R10
MOVQ (72)(CX), R11
MOVQ (80)(CX), R12
MOVQ (88)(CX), R13
ANDQ AX, R8
ANDQ AX, R9
ANDQ AX, R10
ANDQ AX, R11
ANDQ AX, R12
ANDQ AX, R13
MOVQ (48)(SP), AX
ADDQ AX, R8
MOVQ (56)(SP), AX
ADCQ AX, R9
MOVQ (64)(SP), AX
ADCQ AX, R10
MOVQ (72)(SP), AX
ADCQ AX, R11
MOVQ (80)(SP), AX
ADCQ AX, R12
MOVQ (88)(SP), AX
ADCQ AX, R13
// Only R8, R9 and R11 are written back here; R10, R12 and R13 are stored
// to 64(SP), 80(SP) and 88(SP) a few instructions further down, interleaved
// with the start of the AL*BL product, just before each register is reused.
MOVQ R8, (48)(SP)
MOVQ R9, (56)(SP)
MOVQ R11, (72)(SP)
// CX[0-11] <- AL*BL
// Step 3: column-by-column product of the low halves, read directly from
// x and y. This overwrites the sums that step 1 left in z[0..11].
MOVQ (REG_P1), R11
MOVQ (REG_P2), AX
MULQ R11
XORQ R9, R9
MOVQ AX, (CX) // c0
MOVQ R10, (64)(SP)
MOVQ DX, R8
MOVQ (8)(REG_P2), AX
MULQ R11
XORQ R10, R10
ADDQ AX, R8
MOVQ R12, (80)(SP)
ADCQ DX, R9
MOVQ (8)(REG_P1), R12
MOVQ (REG_P2), AX
MULQ R12
ADDQ AX, R8
MOVQ R8, (8)(CX) // c1
ADCQ DX, R9
MOVQ R13, (88)(SP)
ADCQ $0, R10
XORQ R8, R8
MOVQ (16)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (REG_P2), R13
MOVQ (16)(REG_P1), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (8)(REG_P2), AX
MULQ R12
ADDQ AX, R9
MOVQ R9, (16)(CX) // c2
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (24)(REG_P2), AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (24)(REG_P1), AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (16)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (16)(REG_P1), R14
MOVQ (8)(REG_P2), AX
MULQ R14
ADDQ AX, R10
MOVQ R10, (24)(CX) // c3
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (32)(REG_P2), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (16)(REG_P2), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (32)(REG_P1), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (24)(REG_P2), AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (24)(REG_P1), R13
MOVQ (8)(REG_P2), AX
MULQ R13
ADDQ AX, R8
MOVQ R8, (32)(CX) // c4
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (40)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (16)(REG_P2), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (24)(REG_P2), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (40)(REG_P1), R11
MOVQ (REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (32)(REG_P2), AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (32)(REG_P1), R15
MOVQ (8)(REG_P2), AX
MULQ R15
ADDQ AX, R9
MOVQ R9, (40)(CX) //c5
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (16)(REG_P2), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (40)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (32)(REG_P2), AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (8)(REG_P2), AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (24)(REG_P2), AX
MULQ R13
ADDQ AX, R10
MOVQ R10, (48)(CX) // c6
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (40)(REG_P2), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (16)(REG_P2), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (32)(REG_P2), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (24)(REG_P2), AX
MULQ R15
ADDQ AX, R8
MOVQ R8, (56)(CX) // c7
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (24)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (32)(REG_P2), AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (40)(REG_P2), AX
MULQ R13
ADDQ AX, R9
MOVQ R9, (64)(CX) // c8
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (40)(REG_P2), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (32)(REG_P2), AX
MULQ R11
ADDQ AX, R10
MOVQ R10, (72)(CX) // c9
ADCQ DX, R8
ADCQ $0, R9
MOVQ (40)(REG_P2), AX
MULQ R11
ADDQ AX, R8
MOVQ R8, (80)(CX) // c10
ADCQ DX, R9
MOVQ R9, (88)(CX) // c11
// CX[12-23] <- AH*BH
// Step 4: column-by-column product of the high halves (offsets 48..88 of
// x and y). The cN comments below number columns within this product;
// column N is stored at z[12+N].
MOVQ (48)(REG_P1), R11
MOVQ (48)(REG_P2), AX
MULQ R11
XORQ R9, R9
MOVQ AX, (96)(CX) // c0
MOVQ DX, R8
MOVQ (56)(REG_P2), AX
MULQ R11
XORQ R10, R10
ADDQ AX, R8
ADCQ DX, R9
MOVQ (56)(REG_P1), R12
MOVQ (48)(REG_P2), AX
MULQ R12
ADDQ AX, R8
MOVQ R8, (104)(CX) // c1
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (64)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (48)(REG_P2), R13
MOVQ (64)(REG_P1), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (56)(REG_P2), AX
MULQ R12
ADDQ AX, R9
MOVQ R9, (112)(CX) // c2
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (72)(REG_P2), AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (72)(REG_P1), AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (64)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (64)(REG_P1), R14
MOVQ (56)(REG_P2), AX
MULQ R14
ADDQ AX, R10
MOVQ R10, (120)(CX) // c3
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (80)(REG_P2), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (64)(REG_P2), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (80)(REG_P1), R15
MOVQ R13, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (72)(REG_P2), AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (72)(REG_P1), R13
MOVQ (56)(REG_P2), AX
MULQ R13
ADDQ AX, R8
MOVQ R8, (128)(CX) // c4
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (88)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (64)(REG_P2), AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (72)(REG_P2), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (88)(REG_P1), R11
MOVQ (48)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (80)(REG_P2), AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (56)(REG_P2), AX
MULQ R15
ADDQ AX, R9
MOVQ R9, (136)(CX) // c5
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (64)(REG_P2), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (88)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (80)(REG_P2), AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (56)(REG_P2), AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (72)(REG_P2), AX
MULQ R13
ADDQ AX, R10
MOVQ R10, (144)(CX) // c6
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (88)(REG_P2), AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (64)(REG_P2), AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (80)(REG_P2), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (72)(REG_P2), AX
MULQ R15
ADDQ AX, R8
MOVQ R8, (152)(CX) // c7
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (72)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (80)(REG_P2), AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (88)(REG_P2), AX
MULQ R13
ADDQ AX, R9
MOVQ R9, (160)(CX) // c8
ADCQ DX, R10
ADCQ $0, R8
// From here on no third accumulator limb is tracked: a carry out of R8 in
// columns 9 and 10 would be dropped.
// NOTE(review): presumably safe because the top limbs of the operands are
// small enough that these columns cannot overflow -- confirm the input
// bounds with the callers.
MOVQ (88)(REG_P2), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
MOVQ (80)(REG_P2), AX
MULQ R11
ADDQ AX, R10
MOVQ R10, (168)(CX) // c9
ADCQ DX, R8
MOVQ (88)(REG_P2), AX
MULQ R11
ADDQ AX, R8
MOVQ R8, (176)(CX) // c10
ADCQ $0, DX
MOVQ DX, (184)(CX) // c11
// [R8-R15,AX,DX,DI,(SP)] <- (AH+AL)*(BH+BL)-AL*BL
// Step 5a. DI and SI (the x and y pointers) are overwritten here; x and y
// are not read again. Limb 11 is computed in SI and spilled to (SP), whose
// original content (limb 0) has already been consumed.
MOVQ (SP), R8
SUBQ (CX), R8
MOVQ (8)(SP), R9
SBBQ (8)(CX), R9
MOVQ (16)(SP), R10
SBBQ (16)(CX), R10
MOVQ (24)(SP), R11
SBBQ (24)(CX), R11
MOVQ (32)(SP), R12
SBBQ (32)(CX), R12
MOVQ (40)(SP), R13
SBBQ (40)(CX), R13
MOVQ (48)(SP), R14
SBBQ (48)(CX), R14
MOVQ (56)(SP), R15
SBBQ (56)(CX), R15
MOVQ (64)(SP), AX
SBBQ (64)(CX), AX
MOVQ (72)(SP), DX
SBBQ (72)(CX), DX
MOVQ (80)(SP), DI
SBBQ (80)(CX), DI
MOVQ (88)(SP), SI
SBBQ (88)(CX), SI
MOVQ SI, (SP)
// [R8-R15,AX,DX,DI,(SP)] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH
// Step 5b. SI is the scratch register for each limb of AH*BH; the borrow
// stays live across the MOVQ loads. After the last SBBQ, limb 11 of the
// middle term is in SI.
MOVQ (96)(CX), SI
SUBQ SI, R8
MOVQ (104)(CX), SI
SBBQ SI, R9
MOVQ (112)(CX), SI
SBBQ SI, R10
MOVQ (120)(CX), SI
SBBQ SI, R11
MOVQ (128)(CX), SI
SBBQ SI, R12
MOVQ (136)(CX), SI
SBBQ SI, R13
MOVQ (144)(CX), SI
SBBQ SI, R14
MOVQ (152)(CX), SI
SBBQ SI, R15
MOVQ (160)(CX), SI
SBBQ SI, AX
MOVQ (168)(CX), SI
SBBQ SI, DX
MOVQ (176)(CX), SI
SBBQ SI, DI
MOVQ (SP), SI
SBBQ (184)(CX), SI
// FINAL RESULT
// Step 6: add the 12-limb middle term into z[6..17] and ripple the carry
// through z[18..23].
ADDQ (48)(CX), R8
MOVQ R8, (48)(CX)
ADCQ (56)(CX), R9
MOVQ R9, (56)(CX)
ADCQ (64)(CX), R10
MOVQ R10, (64)(CX)
ADCQ (72)(CX), R11
MOVQ R11, (72)(CX)
ADCQ (80)(CX), R12
MOVQ R12, (80)(CX)
ADCQ (88)(CX), R13
MOVQ R13, (88)(CX)
ADCQ (96)(CX), R14
MOVQ R14, (96)(CX)
ADCQ (104)(CX), R15
MOVQ R15, (104)(CX)
ADCQ (112)(CX), AX
MOVQ AX, (112)(CX)
ADCQ (120)(CX), DX
MOVQ DX, (120)(CX)
ADCQ (128)(CX), DI
MOVQ DI, (128)(CX)
ADCQ (136)(CX), SI
MOVQ SI, (136)(CX)
MOVQ (144)(CX), AX
ADCQ $0, AX
MOVQ AX, (144)(CX)
MOVQ (152)(CX), AX
ADCQ $0, AX
MOVQ AX, (152)(CX)
MOVQ (160)(CX), AX
ADCQ $0, AX
MOVQ AX, (160)(CX)
MOVQ (168)(CX), AX
ADCQ $0, AX
MOVQ AX, (168)(CX)
MOVQ (176)(CX), AX
ADCQ $0, AX
MOVQ AX, (176)(CX)
MOVQ (184)(CX), AX
ADCQ $0, AX
MOVQ AX, (184)(CX)
RET
// fp751MontgomeryReduce(z, x): reduces the 24-limb value at x (offsets
// 0..184) to the 12-limb value written at z (offsets 0..88).
// NOTE(review): by its name this is Montgomery reduction, presumably
// z = x * 2^-768 mod p up to a multiple of p -- confirm the exact output
// range with the Go caller.
//
// Structure visible in the code: the work proceeds column by column, from
// column 5 to column 23. Each column accumulates products of a "multiplier"
// limb with a limb of p751 + 1 (P751P1_5..P751P1_11; limbs 0..4 of p + 1 are
// zero and contribute nothing), then adds the matching limb of x.
//   - Columns 5..11 are stored to z[5..11] as temporaries; they are loaded
//     back later as multiplier limbs.
//   - Columns 12..23 are the final output and are stored to z[0..11],
//     overwriting the temporaries once they are no longer needed.
// The multiplier limbs are, in order: x[0..4] (R11, R12, R13, R14, R15) and
// then the temporaries of columns 5..11 (CX, R11, R12, R13, R14, R15, CX).
// R8/R9/R10 rotate as a three-limb column accumulator.
TEXT ·fp751MontgomeryReduce(SB), $0-16
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1
// Column 5. R11 = x[0].
MOVQ (REG_P1), R11
MOVQ P751P1_5, AX
MULQ R11
XORQ R8, R8
ADDQ (40)(REG_P1), AX
MOVQ AX, (40)(REG_P2) // Z5
ADCQ DX, R8
// Column 6. R12 = x[1].
XORQ R9, R9
MOVQ P751P1_6, AX
MULQ R11
XORQ R10, R10
ADDQ AX, R8
ADCQ DX, R9
MOVQ (8)(REG_P1), R12
MOVQ P751P1_5, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (48)(REG_P1), R8
MOVQ R8, (48)(REG_P2) // Z6
ADCQ $0, R9
ADCQ $0, R10
// Column 7. R13 = x[2].
XORQ R8, R8
MOVQ P751P1_7, AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_6, AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (16)(REG_P1), R13
MOVQ P751P1_5, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (56)(REG_P1), R9
MOVQ R9, (56)(REG_P2) // Z7
ADCQ $0, R10
ADCQ $0, R8
// Column 8. R14 = x[3].
XORQ R9, R9
MOVQ P751P1_8, AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_7, AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_6, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (24)(REG_P1), R14
MOVQ P751P1_5, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (64)(REG_P1), R10
MOVQ R10, (64)(REG_P2) // Z8
ADCQ $0, R8
ADCQ $0, R9
// Column 9. R15 = x[4].
XORQ R10, R10
MOVQ P751P1_9, AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_8, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_7, AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_6, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (32)(REG_P1), R15
MOVQ P751P1_5, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (72)(REG_P1), R8
MOVQ R8, (72)(REG_P2) // Z9
ADCQ $0, R9
ADCQ $0, R10
// Column 10. CX = temporary from column 5 (z[5]).
XORQ R8, R8
MOVQ P751P1_10, AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_9, AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_8, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_7, AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_6, AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (40)(REG_P2), CX
MOVQ P751P1_5, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (80)(REG_P1), R9
MOVQ R9, (80)(REG_P2) // Z10
ADCQ $0, R10
ADCQ $0, R8
// Column 11. x[0] (R11) is used for the last time, then R11 is reloaded
// with the temporary from column 6 (z[6]).
XORQ R9, R9
MOVQ P751P1_11, AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_10, AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_9, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_8, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_7, AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_6, AX
MULQ CX
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (48)(REG_P2), R11
MOVQ P751P1_5, AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (88)(REG_P1), R10
MOVQ R10, (88)(REG_P2) // Z11
ADCQ $0, R8
ADCQ $0, R9
// Column 12 -> final z[0]. R12 is reloaded with the temporary from
// column 7 (z[7]) after x[1] has been used for the last time.
XORQ R10, R10
MOVQ P751P1_11, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_10, AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_9, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_8, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_7, AX
MULQ CX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_6, AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (56)(REG_P2), R12
MOVQ P751P1_5, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (96)(REG_P1), R8
MOVQ R8, (REG_P2) // Z0
ADCQ $0, R9
ADCQ $0, R10
// Column 13 -> final z[1]. R13 is reloaded with the temporary from
// column 8 (z[8]).
XORQ R8, R8
MOVQ P751P1_11, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_10, AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_9, AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_8, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_7, AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_6, AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (64)(REG_P2), R13
MOVQ P751P1_5, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (104)(REG_P1), R9
MOVQ R9, (8)(REG_P2) // Z1
ADCQ $0, R10
ADCQ $0, R8
// Column 14 -> final z[2]. R14 is reloaded with the temporary from
// column 9 (z[9]).
XORQ R9, R9
MOVQ P751P1_11, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_10, AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_9, AX
MULQ CX
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_8, AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_7, AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_6, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (72)(REG_P2), R14
MOVQ P751P1_5, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (112)(REG_P1), R10
MOVQ R10, (16)(REG_P2) // Z2
ADCQ $0, R8
ADCQ $0, R9
// Column 15 -> final z[3]. R15 is reloaded with the temporary from
// column 10 (z[10]).
XORQ R10, R10
MOVQ P751P1_11, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_10, AX
MULQ CX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_9, AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_8, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_7, AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_6, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (80)(REG_P2), R15
MOVQ P751P1_5, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (120)(REG_P1), R8
MOVQ R8, (24)(REG_P2) // Z3
ADCQ $0, R9
ADCQ $0, R10
// Column 16 -> final z[4]. CX is reloaded with the temporary from
// column 11 (z[11]); this is the last multiplier limb.
XORQ R8, R8
MOVQ P751P1_11, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_10, AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_9, AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_8, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_7, AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_6, AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (88)(REG_P2), CX
MOVQ P751P1_5, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (128)(REG_P1), R9
MOVQ R9, (32)(REG_P2) // Z4
ADCQ $0, R10
ADCQ $0, R8
// Column 17 -> final z[5]. From here on every multiplier limb is already in
// a register, so the temporaries in z[5..11] can be overwritten.
XORQ R9, R9
MOVQ P751P1_11, AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_10, AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_9, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_8, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_7, AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_6, AX
MULQ CX
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (136)(REG_P1), R10
MOVQ R10, (40)(REG_P2) // Z5
ADCQ $0, R8
ADCQ $0, R9
// Column 18 -> final z[6].
XORQ R10, R10
MOVQ P751P1_11, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_10, AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_9, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_8, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_7, AX
MULQ CX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (144)(REG_P1), R8
MOVQ R8, (48)(REG_P2) // Z6
ADCQ $0, R9
ADCQ $0, R10
// Column 19 -> final z[7].
XORQ R8, R8
MOVQ P751P1_11, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_10, AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_9, AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P751P1_8, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (152)(REG_P1), R9
MOVQ R9, (56)(REG_P2) // Z7
ADCQ $0, R10
ADCQ $0, R8
// Column 20 -> final z[8].
XORQ R9, R9
MOVQ P751P1_11, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_10, AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P751P1_9, AX
MULQ CX
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (160)(REG_P1), R10
MOVQ R10, (64)(REG_P2) // Z8
ADCQ $0, R8
ADCQ $0, R9
// Column 21 -> final z[9].
XORQ R10, R10
MOVQ P751P1_11, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P751P1_10, AX
MULQ CX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (168)(REG_P1), R8 // Z9
MOVQ R8, (72)(REG_P2) // Z9
ADCQ $0, R9
ADCQ $0, R10
// Columns 22 and 23 -> final z[10], z[11]. No further carry-out limb is
// tracked beyond R10.
MOVQ P751P1_11, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADDQ (176)(REG_P1), R9 // Z10
MOVQ R9, (80)(REG_P2) // Z10
ADCQ $0, R10
ADDQ (184)(REG_P1), R10 // Z11
MOVQ R10, (88)(REG_P2) // Z11
RET
// fp751AddLazy(z, x, y): z = x + y over 12 limbs (96 bytes), with no
// reduction of any kind. The carry out of the top limb is discarded.
TEXT ·fp751AddLazy(SB), NOSPLIT, $0-24
	MOVQ	z+0(FP), REG_P3
	MOVQ	x+8(FP), REG_P1
	MOVQ	y+16(FP), REG_P2

	// Bring all twelve limbs of y into registers. The last load overwrites
	// SI (REG_P2) itself, which is fine because y is not addressed again.
	MOVQ	(REG_P2), R8
	MOVQ	(8)(REG_P2), R9
	MOVQ	(16)(REG_P2), R10
	MOVQ	(24)(REG_P2), R11
	MOVQ	(32)(REG_P2), R12
	MOVQ	(40)(REG_P2), R13
	MOVQ	(48)(REG_P2), R14
	MOVQ	(56)(REG_P2), R15
	MOVQ	(64)(REG_P2), AX
	MOVQ	(72)(REG_P2), BX
	MOVQ	(80)(REG_P2), CX
	MOVQ	(88)(REG_P2), SI

	// Add the limbs of x on top, rippling the carry from limb 0 upwards.
	ADDQ	(REG_P1), R8
	ADCQ	(8)(REG_P1), R9
	ADCQ	(16)(REG_P1), R10
	ADCQ	(24)(REG_P1), R11
	ADCQ	(32)(REG_P1), R12
	ADCQ	(40)(REG_P1), R13
	ADCQ	(48)(REG_P1), R14
	ADCQ	(56)(REG_P1), R15
	ADCQ	(64)(REG_P1), AX
	ADCQ	(72)(REG_P1), BX
	ADCQ	(80)(REG_P1), CX
	ADCQ	(88)(REG_P1), SI

	// Write the sum out only after every input limb has been read.
	MOVQ	R8, (REG_P3)
	MOVQ	R9, (8)(REG_P3)
	MOVQ	R10, (16)(REG_P3)
	MOVQ	R11, (24)(REG_P3)
	MOVQ	R12, (32)(REG_P3)
	MOVQ	R13, (40)(REG_P3)
	MOVQ	R14, (48)(REG_P3)
	MOVQ	R15, (56)(REG_P3)
	MOVQ	AX, (64)(REG_P3)
	MOVQ	BX, (72)(REG_P3)
	MOVQ	CX, (80)(REG_P3)
	MOVQ	SI, (88)(REG_P3)
	RET
// fp751X2AddLazy(z, x, y): z = x + y over 24 limbs (192 bytes), with no
// reduction. The carry out of the top limb is discarded.
//
// The addition is one continuous carry chain split into two register-sized
// batches; the carry flag is kept live across the MOVQ stores and loads in
// between, so the order of instructions here matters.
TEXT ·fp751X2AddLazy(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// First batch: limbs 0..10 of x in R8-R15, AX, BX, CX.
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
MOVQ (64)(REG_P1), AX
MOVQ (72)(REG_P1), BX
MOVQ (80)(REG_P1), CX
ADDQ (REG_P2), R8
ADCQ (8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
ADCQ (48)(REG_P2), R14
ADCQ (56)(REG_P2), R15
ADCQ (64)(REG_P2), AX
ADCQ (72)(REG_P2), BX
ADCQ (80)(REG_P2), CX
// Store limbs 0..10; the carry out of limb 10 is still in CF.
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
MOVQ AX, (64)(REG_P3)
MOVQ BX, (72)(REG_P3)
MOVQ CX, (80)(REG_P3)
// Limb 11 on its own, through AX.
MOVQ (88)(REG_P1), AX
ADCQ (88)(REG_P2), AX
MOVQ AX, (88)(REG_P3)
// Second batch: limbs 12..23. The last load overwrites DI (REG_P1) itself;
// x is not addressed again after it.
MOVQ (96)(REG_P1), R8
MOVQ (104)(REG_P1), R9
MOVQ (112)(REG_P1), R10
MOVQ (120)(REG_P1), R11
MOVQ (128)(REG_P1), R12
MOVQ (136)(REG_P1), R13
MOVQ (144)(REG_P1), R14
MOVQ (152)(REG_P1), R15
MOVQ (160)(REG_P1), AX
MOVQ (168)(REG_P1), BX
MOVQ (176)(REG_P1), CX
MOVQ (184)(REG_P1), DI
// Continue the same carry chain (ADCQ, not ADDQ, for limb 12).
ADCQ (96)(REG_P2), R8
ADCQ (104)(REG_P2), R9
ADCQ (112)(REG_P2), R10
ADCQ (120)(REG_P2), R11
ADCQ (128)(REG_P2), R12
ADCQ (136)(REG_P2), R13
ADCQ (144)(REG_P2), R14
ADCQ (152)(REG_P2), R15
ADCQ (160)(REG_P2), AX
ADCQ (168)(REG_P2), BX
ADCQ (176)(REG_P2), CX
ADCQ (184)(REG_P2), DI
MOVQ R8, (96)(REG_P3)
MOVQ R9, (104)(REG_P3)
MOVQ R10, (112)(REG_P3)
MOVQ R11, (120)(REG_P3)
MOVQ R12, (128)(REG_P3)
MOVQ R13, (136)(REG_P3)
MOVQ R14, (144)(REG_P3)
MOVQ R15, (152)(REG_P3)
MOVQ AX, (160)(REG_P3)
MOVQ BX, (168)(REG_P3)
MOVQ CX, (176)(REG_P3)
MOVQ DI, (184)(REG_P3)
RET
// fp751X2SubLazy(z, x, y): z = x - y over 24 limbs (192 bytes). If the
// subtraction borrows, p751 is added to the upper 12 limbs of z, i.e.
// p * 2^768 is added to the result.
//
// The subtraction is one continuous borrow chain split into two
// register-sized batches; the borrow flag is kept live across the MOVQ
// stores and loads in between, so the order of instructions here matters.
TEXT ·fp751X2SubLazy(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// First batch: limbs 0..10 of x in R8-R15, AX, BX, CX.
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
MOVQ (64)(REG_P1), AX
MOVQ (72)(REG_P1), BX
MOVQ (80)(REG_P1), CX
SUBQ (REG_P2), R8
SBBQ (8)(REG_P2), R9
SBBQ (16)(REG_P2), R10
SBBQ (24)(REG_P2), R11
SBBQ (32)(REG_P2), R12
SBBQ (40)(REG_P2), R13
SBBQ (48)(REG_P2), R14
SBBQ (56)(REG_P2), R15
SBBQ (64)(REG_P2), AX
SBBQ (72)(REG_P2), BX
SBBQ (80)(REG_P2), CX
// Store limbs 0..10; the borrow out of limb 10 is still in CF.
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
MOVQ AX, (64)(REG_P3)
MOVQ BX, (72)(REG_P3)
MOVQ CX, (80)(REG_P3)
// Limb 11 on its own, through AX.
MOVQ (88)(REG_P1), AX
SBBQ (88)(REG_P2), AX
MOVQ AX, (88)(REG_P3)
// Second batch: limbs 12..23. The last load overwrites DI (REG_P1) itself;
// x is not addressed again after it.
MOVQ (96)(REG_P1), R8
MOVQ (104)(REG_P1), R9
MOVQ (112)(REG_P1), R10
MOVQ (120)(REG_P1), R11
MOVQ (128)(REG_P1), R12
MOVQ (136)(REG_P1), R13
MOVQ (144)(REG_P1), R14
MOVQ (152)(REG_P1), R15
MOVQ (160)(REG_P1), AX
MOVQ (168)(REG_P1), BX
MOVQ (176)(REG_P1), CX
MOVQ (184)(REG_P1), DI
// Continue the same borrow chain (SBBQ, not SUBQ, for limb 12).
SBBQ (96)(REG_P2), R8
SBBQ (104)(REG_P2), R9
SBBQ (112)(REG_P2), R10
SBBQ (120)(REG_P2), R11
SBBQ (128)(REG_P2), R12
SBBQ (136)(REG_P2), R13
SBBQ (144)(REG_P2), R14
SBBQ (152)(REG_P2), R15
SBBQ (160)(REG_P2), AX
SBBQ (168)(REG_P2), BX
SBBQ (176)(REG_P2), CX
SBBQ (184)(REG_P2), DI
MOVQ R8, (96)(REG_P3)
MOVQ R9, (104)(REG_P3)
MOVQ R10, (112)(REG_P3)
MOVQ R11, (120)(REG_P3)
MOVQ R12, (128)(REG_P3)
MOVQ R13, (136)(REG_P3)
MOVQ R14, (144)(REG_P3)
MOVQ R15, (152)(REG_P3)
MOVQ AX, (160)(REG_P3)
MOVQ BX, (168)(REG_P3)
MOVQ CX, (176)(REG_P3)
MOVQ DI, (184)(REG_P3)
// Now the carry flag is 1 if x-y < 0. If so, add p*2^768.
// AX <- 0 - borrow: all-ones when the subtraction went negative, else zero.
ZERO_AX_WITHOUT_CLOBBERING_FLAGS
SBBQ $0, AX
// Load p into registers:
MOVQ P751_0, R8
// P751_{1,2,3,4} = P751_0, so reuse R8
MOVQ P751_5, R9
MOVQ P751_6, R10
MOVQ P751_7, R11
MOVQ P751_8, R12
MOVQ P751_9, R13
MOVQ P751_10, R14
MOVQ P751_11, R15
// Keep the limbs of p only when the mask is set.
ANDQ AX, R8
ANDQ AX, R9
ANDQ AX, R10
ANDQ AX, R11
ANDQ AX, R12
ANDQ AX, R13
ANDQ AX, R14
ANDQ AX, R15
// z[12..23] <- z[12..23] + (p & mask). Byte offset 96 is limb 12, which is
// what places p at weight 2^768.
ADDQ R8, (96 )(REG_P3)
ADCQ R8, (96+ 8)(REG_P3)
ADCQ R8, (96+16)(REG_P3)
ADCQ R8, (96+24)(REG_P3)
ADCQ R8, (96+32)(REG_P3)
ADCQ R9, (96+40)(REG_P3)
ADCQ R10, (96+48)(REG_P3)
ADCQ R11, (96+56)(REG_P3)
ADCQ R12, (96+64)(REG_P3)
ADCQ R13, (96+72)(REG_P3)
ADCQ R14, (96+80)(REG_P3)
ADCQ R15, (96+88)(REG_P3)
RET