1
0
mirror of https://github.com/henrydcase/nobs.git synced 2024-11-26 09:01:20 +00:00

makes AddReduced503 25% faster

This commit is contained in:
Henry Case 2018-11-20 12:34:51 +00:00
parent ea2ffa2d61
commit e621ca22b3

View File

@ -55,13 +55,13 @@ TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8
// Set x <- x - p // Set x <- x - p
MOVQ ( 0)(REG_P1), CX; SUBQ R8, CX; MOVQ CX, ( 0)(REG_P1) MOVQ ( 0)(REG_P1), CX; SUBQ R8, CX; MOVQ CX, ( 0)(REG_P1)
MOVQ ( 8)(REG_P1), CX; SBBQ R8, CX; MOVQ CX, ( 8)(REG_P1) MOVQ ( 8)(REG_P1), CX; SBBQ R8, CX; MOVQ CX, ( 8)(REG_P1)
MOVQ (16)(REG_P1), CX; SBBQ R8, CX; MOVQ CX, (16)(REG_P1) MOVQ (16)(REG_P1), CX; SBBQ R8, CX; MOVQ CX, (16)(REG_P1)
MOVQ (24)(REG_P1), CX; SBBQ R9, CX; MOVQ CX, (24)(REG_P1) MOVQ (24)(REG_P1), CX; SBBQ R9, CX; MOVQ CX, (24)(REG_P1)
MOVQ (32)(REG_P1), CX; SBBQ R10, CX; MOVQ CX, (32)(REG_P1) MOVQ (32)(REG_P1), CX; SBBQ R10, CX; MOVQ CX, (32)(REG_P1)
MOVQ (40)(REG_P1), CX; SBBQ R11, CX; MOVQ CX, (40)(REG_P1) MOVQ (40)(REG_P1), CX; SBBQ R11, CX; MOVQ CX, (40)(REG_P1)
MOVQ (48)(REG_P1), CX; SBBQ R12, CX; MOVQ CX, (48)(REG_P1) MOVQ (48)(REG_P1), CX; SBBQ R12, CX; MOVQ CX, (48)(REG_P1)
MOVQ (56)(REG_P1), CX; SBBQ R13, CX; MOVQ CX, (56)(REG_P1) MOVQ (56)(REG_P1), CX; SBBQ R13, CX; MOVQ CX, (56)(REG_P1)
// Save carry flag indicating x-p < 0 as a mask // Save carry flag indicating x-p < 0 as a mask
SBBQ $0, AX SBBQ $0, AX
@ -74,14 +74,14 @@ TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8
ANDQ AX, R12 ANDQ AX, R12
ANDQ AX, R13 ANDQ AX, R13
MOVQ ( 0)(REG_P1), CX; ADDQ R8, CX; MOVQ CX, ( 0)(REG_P1) MOVQ ( 0)(REG_P1), CX; ADDQ R8, CX; MOVQ CX, ( 0)(REG_P1)
MOVQ ( 8)(REG_P1), CX; ADCQ R8, CX; MOVQ CX, ( 8)(REG_P1) MOVQ ( 8)(REG_P1), CX; ADCQ R8, CX; MOVQ CX, ( 8)(REG_P1)
MOVQ (16)(REG_P1), CX; ADCQ R8, CX; MOVQ CX, (16)(REG_P1) MOVQ (16)(REG_P1), CX; ADCQ R8, CX; MOVQ CX, (16)(REG_P1)
MOVQ (24)(REG_P1), CX; ADCQ R9, CX; MOVQ CX, (24)(REG_P1) MOVQ (24)(REG_P1), CX; ADCQ R9, CX; MOVQ CX, (24)(REG_P1)
MOVQ (32)(REG_P1), CX; ADCQ R10, CX; MOVQ CX, (32)(REG_P1) MOVQ (32)(REG_P1), CX; ADCQ R10, CX; MOVQ CX, (32)(REG_P1)
MOVQ (40)(REG_P1), CX; ADCQ R11, CX; MOVQ CX, (40)(REG_P1) MOVQ (40)(REG_P1), CX; ADCQ R11, CX; MOVQ CX, (40)(REG_P1)
MOVQ (48)(REG_P1), CX; ADCQ R12, CX; MOVQ CX, (48)(REG_P1) MOVQ (48)(REG_P1), CX; ADCQ R12, CX; MOVQ CX, (48)(REG_P1)
MOVQ (56)(REG_P1), CX; ADCQ R13, CX; MOVQ CX, (56)(REG_P1) MOVQ (56)(REG_P1), CX; ADCQ R13, CX; MOVQ CX, (56)(REG_P1)
RET RET
@ -139,145 +139,100 @@ TEXT ·fp503AddReduced(SB),NOSPLIT,$0-24
MOVQ x+8(FP), REG_P1 MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2 MOVQ y+16(FP), REG_P2
// Used later to calculate a mask // [R8-R15]: z = x + y
XORQ CX, CX MOVQ ( 0)(REG_P1), R8; ADDQ ( 0)(REG_P2), R8; MOVQ R8, ( 0)(REG_P3)
MOVQ ( 8)(REG_P1), R9; ADCQ ( 8)(REG_P2), R9; MOVQ R9, ( 8)(REG_P3)
MOVQ (16)(REG_P1), R10; ADCQ (16)(REG_P2), R10; MOVQ R10, (16)(REG_P3)
MOVQ (24)(REG_P1), R11; ADCQ (24)(REG_P2), R11; MOVQ R11, (24)(REG_P3)
MOVQ (32)(REG_P1), R12; ADCQ (32)(REG_P2), R12; MOVQ R12, (32)(REG_P3)
MOVQ (40)(REG_P1), R13; ADCQ (40)(REG_P2), R13; MOVQ R13, (40)(REG_P3)
MOVQ (48)(REG_P1), R14; ADCQ (48)(REG_P2), R14; MOVQ R14, (48)(REG_P3)
MOVQ (56)(REG_P1), R15; ADCQ (56)(REG_P2), R15; MOVQ R15, (56)(REG_P3)
// [R8-R15]: z = x + y MOVQ ·p503x2+ 0(SB), AX; SUBQ AX, R8
MOVQ ( 0)(REG_P1), R8 MOVQ ·p503x2+ 8(SB), AX; SBBQ AX, R9
MOVQ ( 8)(REG_P1), R9 MOVQ ·p503x2+16(SB), AX; SBBQ AX, R10
MOVQ (16)(REG_P1), R10 MOVQ ·p503x2+24(SB), AX; SBBQ AX, R11
MOVQ (24)(REG_P1), R11 MOVQ ·p503x2+32(SB), AX; SBBQ AX, R12
MOVQ (32)(REG_P1), R12 MOVQ ·p503x2+40(SB), AX; SBBQ AX, R13
MOVQ (40)(REG_P1), R13 MOVQ ·p503x2+48(SB), AX; SBBQ AX, R14
MOVQ (48)(REG_P1), R14 MOVQ ·p503x2+56(SB), AX; SBBQ AX, R15
MOVQ (56)(REG_P1), R15
ADDQ ( 0)(REG_P2), R8
ADCQ ( 8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
ADCQ (48)(REG_P2), R14
ADCQ (56)(REG_P2), R15
MOVQ P503X2_0, AX MOVQ ( 0)(REG_P3), AX; CMOVQCC R8, AX; MOVQ AX, ( 0)(REG_P3)
SUBQ AX, R8 MOVQ ( 8)(REG_P3), AX; CMOVQCC R9, AX; MOVQ AX, ( 8)(REG_P3)
MOVQ P503X2_1, AX MOVQ (16)(REG_P3), AX; CMOVQCC R10, AX; MOVQ AX, (16)(REG_P3)
SBBQ AX, R9 MOVQ (24)(REG_P3), AX; CMOVQCC R11, AX; MOVQ AX, (24)(REG_P3)
SBBQ AX, R10 MOVQ (32)(REG_P3), AX; CMOVQCC R12, AX; MOVQ AX, (32)(REG_P3)
MOVQ P503X2_3, AX MOVQ (40)(REG_P3), AX; CMOVQCC R13, AX; MOVQ AX, (40)(REG_P3)
SBBQ AX, R11 MOVQ (48)(REG_P3), AX; CMOVQCC R14, AX; MOVQ AX, (48)(REG_P3)
MOVQ P503X2_4, AX MOVQ (56)(REG_P3), AX; CMOVQCC R15, AX; MOVQ AX, (56)(REG_P3)
SBBQ AX, R12
MOVQ P503X2_5, AX
SBBQ AX, R13
MOVQ P503X2_6, AX
SBBQ AX, R14
MOVQ P503X2_7, AX
SBBQ AX, R15
SBBQ $0, CX // mask
// move z to REG_P3
MOVQ R8, ( 0)(REG_P3)
MOVQ R9, ( 8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
// if z<0 add p503x2 back
MOVQ P503X2_0, R8
MOVQ P503X2_1, R9
MOVQ P503X2_3, R10
MOVQ P503X2_4, R11
MOVQ P503X2_5, R12
MOVQ P503X2_6, R13
MOVQ P503X2_7, R14
ANDQ CX, R8
ANDQ CX, R9
ANDQ CX, R10
ANDQ CX, R11
ANDQ CX, R12
ANDQ CX, R13
ANDQ CX, R14
ADDQ R8, ( 0)(REG_P3)
ADCQ R9, ( 8)(REG_P3)
ADCQ R9, (16)(REG_P3)
ADCQ R10,(24)(REG_P3)
ADCQ R11,(32)(REG_P3)
ADCQ R12,(40)(REG_P3)
ADCQ R13,(48)(REG_P3)
ADCQ R14,(56)(REG_P3)
RET RET
// fp503SubReduced: z = x - y over eight 64-bit limbs, followed by a
// conditional correction that adds the P503X2_* constants back when the
// subtraction borrowed (i.e. the raw difference went negative).
//
// Arguments (frame $0-24: no locals, 24 bytes of arguments):
//   z+0(FP)  - pointer to the 8-limb result
//   x+8(FP)  - pointer to the 8-limb minuend
//   y+16(FP) - pointer to the 8-limb subtrahend
//
// The visible body contains no branch instructions: the correction is
// selected by AND-ing the constants with a 0 / all-ones mask held in CX.
// Registers written here: CX, R8-R15, the flags, and whichever registers
// REG_P1/REG_P2/REG_P3 expand to (macros defined outside this view).
// NOTE(review): P503X2_n are defined elsewhere; per the in-body comment they
// are presumably the limbs of 2*p503 - confirm against the constant table.
TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24 TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24
// Load the three pointer arguments.
MOVQ z+0(FP), REG_P3 MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1 MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2 MOVQ y+16(FP), REG_P2
// Used later to calculate a mask // Used later to calculate a mask
XORQ CX, CX XORQ CX, CX
// [R8-R15] <- the eight limbs of x, least-significant limb first.
MOVQ ( 0)(REG_P1), R8 MOVQ ( 0)(REG_P1), R8
MOVQ ( 8)(REG_P1), R9 MOVQ ( 8)(REG_P1), R9
MOVQ (16)(REG_P1), R10 MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11 MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12 MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13 MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14 MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15 MOVQ (56)(REG_P1), R15
// [R8-R15] <- x - y. SUBQ starts the borrow chain and each SBBQ consumes the
// borrow of the previous limb, so these eight instructions must stay in order.
SUBQ ( 0)(REG_P2), R8 SUBQ ( 0)(REG_P2), R8
SBBQ ( 8)(REG_P2), R9 SBBQ ( 8)(REG_P2), R9
SBBQ (16)(REG_P2), R10 SBBQ (16)(REG_P2), R10
SBBQ (24)(REG_P2), R11 SBBQ (24)(REG_P2), R11
SBBQ (32)(REG_P2), R12 SBBQ (32)(REG_P2), R12
SBBQ (40)(REG_P2), R13 SBBQ (40)(REG_P2), R13
SBBQ (48)(REG_P2), R14 SBBQ (48)(REG_P2), R14
SBBQ (56)(REG_P2), R15 SBBQ (56)(REG_P2), R15
// mask // mask
// CX was zeroed above, so CX <- 0 - 0 - CF: all ones if the final limb
// borrowed, zero otherwise.
SBBQ $0, CX SBBQ $0, CX
// store x-y in REG_P3 // store x-y in REG_P3
MOVQ R8, ( 0)(REG_P3) MOVQ R8, ( 0)(REG_P3)
MOVQ R9, ( 8)(REG_P3) MOVQ R9, ( 8)(REG_P3)
MOVQ R10, (16)(REG_P3) MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3) MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3) MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3) MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3) MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3) MOVQ R15, (56)(REG_P3)
// if z<0 add p503x2 back // if z<0 add p503x2 back
// Only seven constants are loaded (there is no P503X2_2); R9 is reused for
// both the limb at offset 8 and the limb at offset 16 in the addition below.
// NOTE(review): presumably limb 2 of the constant equals limb 1 - confirm
// against the P503X2_* definitions.
MOVQ P503X2_0, R8 MOVQ P503X2_0, R8
MOVQ P503X2_1, R9 MOVQ P503X2_1, R9
MOVQ P503X2_3, R10 MOVQ P503X2_3, R10
MOVQ P503X2_4, R11 MOVQ P503X2_4, R11
MOVQ P503X2_5, R12 MOVQ P503X2_5, R12
MOVQ P503X2_6, R13 MOVQ P503X2_6, R13
MOVQ P503X2_7, R14 MOVQ P503X2_7, R14
// Apply the mask: each constant limb is kept when CX is all ones and
// cleared to zero when CX is zero.
ANDQ CX, R8 ANDQ CX, R8
ANDQ CX, R9 ANDQ CX, R9
ANDQ CX, R10 ANDQ CX, R10
ANDQ CX, R11 ANDQ CX, R11
ANDQ CX, R12 ANDQ CX, R12
ANDQ CX, R13 ANDQ CX, R13
ANDQ CX, R14 ANDQ CX, R14
// z <- z + (masked constant), added directly into memory. ADDQ starts the
// carry chain and each ADCQ consumes the previous carry; the carry out of
// the last limb is not used.
ADDQ R8, ( 0)(REG_P3) ADDQ R8, ( 0)(REG_P3)
ADCQ R9, ( 8)(REG_P3) ADCQ R9, ( 8)(REG_P3)
ADCQ R9, (16)(REG_P3) ADCQ R9, (16)(REG_P3)
ADCQ R10,(24)(REG_P3) ADCQ R10,(24)(REG_P3)
ADCQ R11,(32)(REG_P3) ADCQ R11,(32)(REG_P3)
ADCQ R12,(40)(REG_P3) ADCQ R12,(40)(REG_P3)
ADCQ R13,(48)(REG_P3) ADCQ R13,(48)(REG_P3)
ADCQ R14,(56)(REG_P3) ADCQ R14,(56)(REG_P3)
RET RET
@ -332,124 +287,124 @@ TEXT ·fp503Mul(SB), $96-24
// (SP[0-3],R8,R9,R10,R11) <- (AH+AL)*(BH+BL). // (SP[0-3],R8,R9,R10,R11) <- (AH+AL)*(BH+BL).
// MUL using comba; In comments below U=AH+AL V=BH+BL // MUL using comba; In comments below U=AH+AL V=BH+BL
// U0*V0 // U0*V0
MOVQ (CX), AX MOVQ (CX), AX
MULQ R12 MULQ R12
MOVQ AX, (SP) // C0 MOVQ AX, (SP) // C0
MOVQ DX, R8 MOVQ DX, R8
// U0*V1 // U0*V1
XORQ R9, R9 XORQ R9, R9
MOVQ (CX), AX MOVQ (CX), AX
MULQ R13 MULQ R13
ADDQ AX, R8 ADDQ AX, R8
ADCQ DX, R9 ADCQ DX, R9
// U1*V0 // U1*V0
XORQ R10, R10 XORQ R10, R10
MOVQ (8)(CX), AX MOVQ (8)(CX), AX
MULQ R12 MULQ R12
ADDQ AX, R8 ADDQ AX, R8
MOVQ R8, (8)(SP) // C1 MOVQ R8, (8)(SP) // C1
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
// U0*V2 // U0*V2
XORQ R8, R8 XORQ R8, R8
MOVQ (CX), AX MOVQ (CX), AX
MULQ R14 MULQ R14
ADDQ AX, R9 ADDQ AX, R9
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
// U2*V0 // U2*V0
MOVQ (16)(CX), AX MOVQ (16)(CX), AX
MULQ R12 MULQ R12
ADDQ AX, R9 ADDQ AX, R9
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
// U1*V1 // U1*V1
MOVQ (8)(CX), AX MOVQ (8)(CX), AX
MULQ R13 MULQ R13
ADDQ AX, R9 ADDQ AX, R9
MOVQ R9, (16)(SP) // C2 MOVQ R9, (16)(SP) // C2
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
// U0*V3 // U0*V3
XORQ R9, R9 XORQ R9, R9
MOVQ (CX), AX MOVQ (CX), AX
MULQ R15 MULQ R15
ADDQ AX, R10 ADDQ AX, R10
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
// U3*V0 // U3*V0
MOVQ (24)(CX), AX MOVQ (24)(CX), AX
MULQ R12 MULQ R12
ADDQ AX, R10 ADDQ AX, R10
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
// U1*V2 // U1*V2
MOVQ (8)(CX), AX MOVQ (8)(CX), AX
MULQ R14 MULQ R14
ADDQ AX, R10 ADDQ AX, R10
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
// U2*V1 // U2*V1
MOVQ (16)(CX), AX MOVQ (16)(CX), AX
MULQ R13 MULQ R13
ADDQ AX, R10 ADDQ AX, R10
MOVQ R10, (24)(SP) // C3 MOVQ R10, (24)(SP) // C3
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
// U1*V3 // U1*V3
XORQ R10, R10 XORQ R10, R10
MOVQ (8)(CX), AX MOVQ (8)(CX), AX
MULQ R15 MULQ R15
ADDQ AX, R8 ADDQ AX, R8
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
// U3*V1 // U3*V1
MOVQ (24)(CX), AX MOVQ (24)(CX), AX
MULQ R13 MULQ R13
ADDQ AX, R8 ADDQ AX, R8
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
// U2*V2 // U2*V2
MOVQ (16)(CX), AX MOVQ (16)(CX), AX
MULQ R14 MULQ R14
ADDQ AX, R8 ADDQ AX, R8
MOVQ R8, (32)(SP) // C4 MOVQ R8, (32)(SP) // C4
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
// U2*V3 // U2*V3
XORQ R11, R11 XORQ R11, R11
MOVQ (16)(CX), AX MOVQ (16)(CX), AX
MULQ R15 MULQ R15
ADDQ AX, R9 ADDQ AX, R9
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R11 ADCQ $0, R11
// U3*V2 // U3*V2
MOVQ (24)(CX), AX MOVQ (24)(CX), AX
MULQ R14 MULQ R14
ADDQ AX, R9 // C5 ADDQ AX, R9 // C5
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R11 ADCQ $0, R11
// U3*V3 // U3*V3
MOVQ (24)(CX), AX MOVQ (24)(CX), AX
MULQ R15 MULQ R15
ADDQ AX, R10 // C6 ADDQ AX, R10 // C6
ADCQ DX, R11 // C7 ADCQ DX, R11 // C7
MOVQ (64)(SP), AX MOVQ (64)(SP), AX
@ -483,244 +438,244 @@ TEXT ·fp503Mul(SB), $96-24
// CX[0-7] <- AL*BL // CX[0-7] <- AL*BL
// U0*V0 // U0*V0
MOVQ (REG_P1), R11 MOVQ (REG_P1), R11
MOVQ (REG_P2), AX MOVQ (REG_P2), AX
MULQ R11 MULQ R11
XORQ R9, R9 XORQ R9, R9
MOVQ AX, (CX) // C0 MOVQ AX, (CX) // C0
MOVQ DX, R8 MOVQ DX, R8
// U0*V1 // U0*V1
MOVQ (16)(REG_P1), R14 MOVQ (16)(REG_P1), R14
MOVQ (8)(REG_P2), AX MOVQ (8)(REG_P2), AX
MULQ R11 MULQ R11
XORQ R10, R10 XORQ R10, R10
ADDQ AX, R8 ADDQ AX, R8
ADCQ DX, R9 ADCQ DX, R9
// U1*V0 // U1*V0
MOVQ (8)(REG_P1), R12 MOVQ (8)(REG_P1), R12
MOVQ (REG_P2), AX MOVQ (REG_P2), AX
MULQ R12 MULQ R12
ADDQ AX, R8 ADDQ AX, R8
MOVQ R8, (8)(CX) // C1 MOVQ R8, (8)(CX) // C1
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
// U0*V2 // U0*V2
XORQ R8, R8 XORQ R8, R8
MOVQ (16)(REG_P2), AX MOVQ (16)(REG_P2), AX
MULQ R11 MULQ R11
ADDQ AX, R9 ADDQ AX, R9
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
// U2*V0 // U2*V0
MOVQ (REG_P2), R13 MOVQ (REG_P2), R13
MOVQ R14, AX MOVQ R14, AX
MULQ R13 MULQ R13
ADDQ AX, R9 ADDQ AX, R9
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
// U1*V1 // U1*V1
MOVQ (8)(REG_P2), AX MOVQ (8)(REG_P2), AX
MULQ R12 MULQ R12
ADDQ AX, R9 ADDQ AX, R9
MOVQ R9, (16)(CX) // C2 MOVQ R9, (16)(CX) // C2
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
// U0*V3 // U0*V3
XORQ R9, R9 XORQ R9, R9
MOVQ (24)(REG_P2), AX MOVQ (24)(REG_P2), AX
MULQ R11 MULQ R11
MOVQ (24)(REG_P1), R15 MOVQ (24)(REG_P1), R15
ADDQ AX, R10 ADDQ AX, R10
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
// U3*V1 // U3*V1
MOVQ R15, AX MOVQ R15, AX
MULQ R13 MULQ R13
ADDQ AX, R10 ADDQ AX, R10
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
// U2*V2 // U2*V2
MOVQ (16)(REG_P2), AX MOVQ (16)(REG_P2), AX
MULQ R12 MULQ R12
ADDQ AX, R10 ADDQ AX, R10
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
// U2*V3 // U2*V3
MOVQ (8)(REG_P2), AX MOVQ (8)(REG_P2), AX
MULQ R14 MULQ R14
ADDQ AX, R10 ADDQ AX, R10
MOVQ R10, (24)(CX) // C3 MOVQ R10, (24)(CX) // C3
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
// U3*V2 // U3*V2
XORQ R10, R10 XORQ R10, R10
MOVQ (24)(REG_P2), AX MOVQ (24)(REG_P2), AX
MULQ R12 MULQ R12
ADDQ AX, R8 ADDQ AX, R8
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
// U3*V1 // U3*V1
MOVQ (8)(REG_P2), AX MOVQ (8)(REG_P2), AX
MULQ R15 MULQ R15
ADDQ AX, R8 ADDQ AX, R8
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
// U2*V2 // U2*V2
MOVQ (16)(REG_P2), AX MOVQ (16)(REG_P2), AX
MULQ R14 MULQ R14
ADDQ AX, R8 ADDQ AX, R8
MOVQ R8, (32)(CX) // C4 MOVQ R8, (32)(CX) // C4
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
// U2*V3 // U2*V3
XORQ R8, R8 XORQ R8, R8
MOVQ (24)(REG_P2), AX MOVQ (24)(REG_P2), AX
MULQ R14 MULQ R14
ADDQ AX, R9 ADDQ AX, R9
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
// U3*V2 // U3*V2
MOVQ (16)(REG_P2), AX MOVQ (16)(REG_P2), AX
MULQ R15 MULQ R15
ADDQ AX, R9 ADDQ AX, R9
MOVQ R9, (40)(CX) // C5 MOVQ R9, (40)(CX) // C5
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
// U3*V3 // U3*V3
MOVQ (24)(REG_P2), AX MOVQ (24)(REG_P2), AX
MULQ R15 MULQ R15
ADDQ AX, R10 ADDQ AX, R10
MOVQ R10, (48)(CX) // C6 MOVQ R10, (48)(CX) // C6
ADCQ DX, R8 ADCQ DX, R8
MOVQ R8, (56)(CX) // C7 MOVQ R8, (56)(CX) // C7
// CX[8-15] <- AH*BH // CX[8-15] <- AH*BH
MOVQ (32)(REG_P1), R11 MOVQ (32)(REG_P1), R11
MOVQ (32)(REG_P2), AX MOVQ (32)(REG_P2), AX
MULQ R11 MULQ R11
XORQ R9, R9 XORQ R9, R9
MOVQ AX, (64)(CX) // C0 MOVQ AX, (64)(CX) // C0
MOVQ DX, R8 MOVQ DX, R8
MOVQ (48)(REG_P1), R14 MOVQ (48)(REG_P1), R14
MOVQ (40)(REG_P2), AX MOVQ (40)(REG_P2), AX
MULQ R11 MULQ R11
XORQ R10, R10 XORQ R10, R10
ADDQ AX, R8 ADDQ AX, R8
ADCQ DX, R9 ADCQ DX, R9
MOVQ (40)(REG_P1), R12 MOVQ (40)(REG_P1), R12
MOVQ (32)(REG_P2), AX MOVQ (32)(REG_P2), AX
MULQ R12 MULQ R12
ADDQ AX, R8 ADDQ AX, R8
MOVQ R8, (72)(CX) // C1 MOVQ R8, (72)(CX) // C1
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
XORQ R8, R8 XORQ R8, R8
MOVQ (48)(REG_P2), AX MOVQ (48)(REG_P2), AX
MULQ R11 MULQ R11
ADDQ AX, R9 ADDQ AX, R9
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
MOVQ (32)(REG_P2), R13 MOVQ (32)(REG_P2), R13
MOVQ R14, AX MOVQ R14, AX
MULQ R13 MULQ R13
ADDQ AX, R9 ADDQ AX, R9
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
MOVQ (40)(REG_P2), AX MOVQ (40)(REG_P2), AX
MULQ R12 MULQ R12
ADDQ AX, R9 ADDQ AX, R9
MOVQ R9, (80)(CX) // C2 MOVQ R9, (80)(CX) // C2
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
XORQ R9, R9 XORQ R9, R9
MOVQ (56)(REG_P2), AX MOVQ (56)(REG_P2), AX
MULQ R11 MULQ R11
MOVQ (56)(REG_P1), R15 MOVQ (56)(REG_P1), R15
ADDQ AX, R10 ADDQ AX, R10
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
MOVQ R15, AX MOVQ R15, AX
MULQ R13 MULQ R13
ADDQ AX, R10 ADDQ AX, R10
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
MOVQ (48)(REG_P2), AX MOVQ (48)(REG_P2), AX
MULQ R12 MULQ R12
ADDQ AX, R10 ADDQ AX, R10
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
MOVQ (40)(REG_P2), AX MOVQ (40)(REG_P2), AX
MULQ R14 MULQ R14
ADDQ AX, R10 ADDQ AX, R10
MOVQ R10, (88)(CX) // C3 MOVQ R10, (88)(CX) // C3
ADCQ DX, R8 ADCQ DX, R8
ADCQ $0, R9 ADCQ $0, R9
XORQ R10, R10 XORQ R10, R10
MOVQ (56)(REG_P2), AX MOVQ (56)(REG_P2), AX
MULQ R12 MULQ R12
ADDQ AX, R8 ADDQ AX, R8
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
MOVQ (40)(REG_P2), AX MOVQ (40)(REG_P2), AX
MULQ R15 MULQ R15
ADDQ AX, R8 ADDQ AX, R8
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
MOVQ (48)(REG_P2), AX MOVQ (48)(REG_P2), AX
MULQ R14 MULQ R14
ADDQ AX, R8 ADDQ AX, R8
MOVQ R8, (96)(CX) // C4 MOVQ R8, (96)(CX) // C4
ADCQ DX, R9 ADCQ DX, R9
ADCQ $0, R10 ADCQ $0, R10
XORQ R8, R8 XORQ R8, R8
MOVQ (56)(REG_P2), AX MOVQ (56)(REG_P2), AX
MULQ R14 MULQ R14
ADDQ AX, R9 ADDQ AX, R9
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
MOVQ (48)(REG_P2), AX MOVQ (48)(REG_P2), AX
MULQ R15 MULQ R15
ADDQ AX, R9 ADDQ AX, R9
MOVQ R9, (104)(CX) // C5 MOVQ R9, (104)(CX) // C5
ADCQ DX, R10 ADCQ DX, R10
ADCQ $0, R8 ADCQ $0, R8
MOVQ (56)(REG_P2), AX MOVQ (56)(REG_P2), AX
MULQ R15 MULQ R15
ADDQ AX, R10 ADDQ AX, R10
MOVQ R10, (112)(CX) // C6 MOVQ R10, (112)(CX) // C6
ADCQ DX, R8 ADCQ DX, R8
MOVQ R8, (120)(CX) // C7 MOVQ R8, (120)(CX) // C7
// [R8-R15] <- (AH+AL)*(BH+BL) - AL*BL // [R8-R15] <- (AH+AL)*(BH+BL) - AL*BL
MOVQ (SP), R8 MOVQ (SP), R8