- #if defined(__APPLE__)
- /* The Mach-O ABI on macOS prefixes C symbol names with an underscore. */
- #define C_ABI(x) _ ## x
- #define HIDDEN .private_extern
- #else
- #define C_ABI(x) x
- #define HIDDEN .hidden
- #endif
-
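- # Constants for converting from radix 2^64 to radix 2^52:
- # .LpermMask0 selects, for each 52-bit limb i, the four consecutive 16-bit
- # words of the input that contain bits 52*i .. 52*i+63; .LshiftMask0 holds
- # the residual bit shift 52*i mod 16 (0,4,8,12 repeating); .LandMask is 2^52-1.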
- .p2align 6
- .LpermMask0:
- .word 0,1,2,3, 3,4,5,6, 6,7,8,9, 9,10,11,12, 13,14,15,16, 16,17,18,19, 19,20,21,22, 22,23,24,25
- .LshiftMask0:
- .quad 0,4,8,12,0,4,8,12
- .LandMask:
- .quad 0xfffffffffffff
-
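- # The field prime p in radix-2^52, 15 limbs, least-significant limb first.
- # The seven lowest limbs are all ones; this appears to be the SIKEp751 prime
- # p = 2^372 * 3^239 - 1.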
- .p2align 6
- .Lpoly:
- .quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff
- .quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff
- .quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480
- .quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0
-
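- # Presumably R^2 mod p with R = 2^(52*15) = 2^780: Montgomery-multiplying an
- # operand by this constant (see to_mont_ifma) yields its Montgomery form a*R mod p.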
- .LR2:
- .quad 0x000dad40589641fd, 0x000452a233046449, 0x000edb010161a696, 0x00036941472e3fd8
- .quad 0x000e2082a2e7065e, 0x000904f8751f40bf, 0x0007fc814932cca8, 0x00033f174b08b2ee
- .quad 0x0009814efb9f1375, 0x00099594a1afe512, 0x00043c75310de66d, 0x000197021a5b37b0
- .quad 0x000cc1a272e73959, 0x000a733d7c97cd76, 0x0000000000292ee8, 0
-
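- # The constant 1. Montgomery-multiplying by it (see from_mont_ifma) strips
- # the factor R and converts a value out of Montgomery form.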
- .Lone:
- .quad 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-
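- # norm2red(%rdi, %rsi): repack an operand from twelve 64-bit words (96 bytes
- # at %rsi) into fifteen 52-bit limbs, stored as 15 qwords at %rdi.
- # %k1 = 0x3FFFFF masks the load of the 22 remaining input words (bytes 52..95);
- # %k2 = 0x7F masks the store of the upper 7 output limbs.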
- .globl C_ABI(norm2red)
- .p2align 6
- C_ABI(norm2red):
- mov $0x3FFFFF, %eax
- kmovd %eax, %k1
- mov $0x7F, %eax
- kmovd %eax, %k2
-
- vmovdqa64 .LpermMask0(%rip), %zmm0
- vmovdqa64 .LshiftMask0(%rip), %zmm1
- vpbroadcastq .LandMask(%rip), %zmm10
-
- vpermw 52*0(%rsi), %zmm0, %zmm2
- vmovdqu16 52*1(%rsi), %zmm3{%k1}{z}
- vpermw %zmm3, %zmm0, %zmm3
-
- # Align each limb to bit 0 and keep only the low 52 bits
- vpsrlvq %zmm1, %zmm2, %zmm2
- vpsrlvq %zmm1, %zmm3, %zmm3
- 
- vpandq %zmm10, %zmm2, %zmm2
- vpandq %zmm10, %zmm3, %zmm3
-
- vmovdqu64 %zmm2, 64*0(%rdi)
- vmovdqu64 %zmm3, 64*1(%rdi){%k2}
- ret
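- # For reference, a rough C equivalent of the conversion above (illustrative
- # only; it assumes a 12-qword input a[] and a 15-qword output r[], matching
- # the loads and stores in norm2red):
- #
- #   for (int i = 0; i < 15; i++) {
- #       unsigned b = 52 * i;
- #       uint64_t v = a[b / 64] >> (b % 64);
- #       if ((b % 64) != 0 && (b / 64) + 1 < 12)
- #           v |= a[b / 64 + 1] << (64 - (b % 64));
- #       r[i] = v & ((1ULL << 52) - 1);
- #   }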
-
-
- #define res %rdi // uint64_t *rp,
- #define a0 %rsi // const uint64_t *ap,
- #define bpi %rdx // const uint64_t *bptr,
- #define m0 %rcx // modulus limb 0 (p[0])
- 
- #define b_ptr %rax // running pointer into the B operand
- 
- #define acc0 %r9 // scalar accumulator for limb 0
- 
- #define itr %r10 // loop counter
- #define t0 %r11 // scratch
- #define t1 %r12 // scratch
- #define t2 %r13 // scratch
-
- #define A0 %zmm0 // limbs 1..8 of operand A
- #define A1 %zmm1 // limbs 9..14 of operand A
- 
- #define M0 %zmm2 // limbs 1..8 of the modulus
- #define M1 %zmm3 // limbs 9..14 of the modulus
- 
- #define ACC0 %zmm4 // accumulator, low 8 limbs
- #define ACC0_xmm %xmm4
- #define ACC1 %zmm5 // accumulator, high limbs
- 
- #define Y_curr %zmm6 // broadcast of the current Montgomery quotient y
- #define Y_prev %zmm7
- #define B_curr %zmm8 // broadcast of the current limb of B
- #define B_prev %zmm9
- 
- #define TMP %zmm10
- #define TMP_xmm %xmm10
- 
- #define ZERO %zmm11 // all-zero vector
- #define AND_MASK %zmm12 // 2^52 - 1 in every lane
- 
- #define ACC0b %zmm13 // secondary accumulators, merged into ACC0/ACC1
- #define ACC1b %zmm14 // once per iteration
-
- ###############################################################################
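- # to_mont_ifma(rp, ap): convert ap into Montgomery form, rp = ap * R mod p,
- # by tail-calling fp_mul_ifma with the B operand pointed at R^2.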
- .globl C_ABI(to_mont_ifma)
- .p2align 6
- C_ABI(to_mont_ifma):
- leaq .LR2(%rip), bpi
- jmp C_ABI(fp_mul_ifma)
- ###############################################################################
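- # from_mont_ifma(rp, ap): convert ap out of Montgomery form, rp = ap * R^-1
- # mod p, by tail-calling fp_mul_ifma with the B operand pointed at 1.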
- .globl C_ABI(from_mont_ifma)
- .p2align 6
- C_ABI(from_mont_ifma):
- leaq .Lone(%rip), bpi
- jmp C_ABI(fp_mul_ifma)
- ###############################################################################
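- # fp_mul_ifma(rp, ap, bp): Montgomery multiplication rp = ap * bp * R^-1 mod p
- # in radix 2^52 with 15 limbs (R = 2^780). The scalar unit handles limb 0 and
- # the Montgomery quotient, the IFMA units handle limbs 1..14. Illustrative
- # pseudo-C of the word-by-word algorithm implemented below:
- #
- #   acc = 0;                              // multi-limb accumulator
- #   for (i = 0; i < 15; i++) {
- #       acc += a * b[i];                  // schoolbook multiply-accumulate
- #       y    = acc[0] & ((1ULL<<52)-1);   // quotient; -p^-1 mod 2^52 == 1 here
- #       acc += y * p;                     // limb 0 of acc becomes zero
- #       acc >>= 52;                       // drop the zero limb
- #   }
- #   // acc == a*b*R^-1 (not necessarily fully reduced mod p); the tail of the
- #   // routine only renormalizes the limbs back to 52 bits.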
- .globl C_ABI(fp_mul_ifma)
- .p2align 6
- C_ABI(fp_mul_ifma):
-
- push %rbx
- push %r12
- push %r13
-
- mov bpi, b_ptr
-
- mov $1, t0
- mov $0x3f, t1
- kmovq t0, %k1
- kmovq t1, %k2
-
- vpbroadcastq .LandMask(%rip), AND_MASK
- vpxorq ZERO, ZERO, ZERO
-
- # Load operand A into registers. A[0] is kept in a general-purpose (ALU)
- # register so the scalar unit can compute (A*B)[0] * K0 without waiting on
- # the IFMA latency.
- vmovdqu64 8*1+64*0(a0), A0
- vmovdqu64 8*1+64*1(a0), A1{%k2}{z}
- mov 8*0(a0), a0
-
- # Load the modulus
- mov .Lpoly(%rip), m0
- vmovdqu64 8*1+64*0+.Lpoly(%rip), M0
- vmovdqu64 8*1+64*1+.Lpoly(%rip), M1{%k2}{z}
-
- # Prepare the accumulators
- vpxorq ACC0, ACC0, ACC0
- vpxorq ACC1, ACC1, ACC1
- vpxorq B_curr, B_curr, B_curr
- vpxorq Y_curr, Y_curr, Y_curr
- xor acc0, acc0
-
- mov $15, itr
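- # Main loop: one iteration per limb of B. B_curr/Y_curr are broadcast in the
- # middle of an iteration, so the *high* halves of their products are folded
- # in at the top of the next iteration (they are zero on the first pass).
- # The scalar unit tracks limb 0 in acc0; the vectors hold limbs 1..14.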
- 1:
- vpxorq ACC0b, ACC0b, ACC0b
- vpxorq ACC1b, ACC1b, ACC1b
-
- # High multiplications
- vpmadd52huq B_curr, A0, ACC0b
- vpmadd52huq B_curr, A1, ACC1b
-
- vpmadd52huq Y_curr, M0, ACC0b
- vpmadd52huq Y_curr, M1, ACC1b
-
- # Shift the vector accumulator right by one 64-bit limb
- valignq $1, ACC0, ACC1, ACC0
- valignq $1, ACC1, ZERO, ACC1
- mov a0, %rdx
-
- mulx (b_ptr), t0, t2
- add t0, acc0
- adc $0, t2
-
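- # Montgomery quotient y = acc0 * (-p^-1) mod 2^52. Because the lowest limb of
- # p is 2^52-1 (p == -1 mod 2^52), -p^-1 mod 2^52 equals 1, so y is simply the
- # low 52 bits of the accumulator.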
- mov acc0, %rdx
- and .LandMask(%rip), %rdx
-
- vpbroadcastq %rdx, Y_curr
- vpbroadcastq (b_ptr), B_curr
-
- mulx m0, t0, t1
- add t0, acc0
- adc t1, t2
-
- shrd $52, t2, acc0
-
- # Low multiplications
- vpmadd52luq B_curr, A0, ACC0b
- vpmadd52luq B_curr, A1, ACC1b
-
- vpmadd52luq Y_curr, M0, ACC0
- vpmadd52luq Y_curr, M1, ACC1
-
- vpaddq ACC0b, ACC0, ACC0
- vpaddq ACC1b, ACC1, ACC1
-
- vmovq ACC0_xmm, t0
- add t0, acc0
-
- lea 8(b_ptr), b_ptr
- dec itr
- jne 1b
-
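- # Loop epilogue: fold the scalar limb-0 accumulator into lane 0 of ACC0, then
- # shift A and M up by one limb and add the high product halves still pending
- # from the final iteration.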
- vmovq acc0, TMP_xmm
-
- vmovdqa64 TMP, ACC0{%k1}
-
- valignq $7, A0, A1, A1
- valignq $7, ZERO, A0, A0
-
- valignq $7, M0, M1, M1
- valignq $7, ZERO, M0, M0
-
- # The last high multiplications
- vpmadd52huq B_curr, A0, ACC0
- vpmadd52huq B_curr, A1, ACC1
-
- vpmadd52huq Y_curr, M0, ACC0
- vpmadd52huq Y_curr, M1, ACC1
-
- # Normalize the result back to 52-bit limbs
- vpsrlq $52, ACC0, A0
- vpsrlq $52, ACC1, A1
-
- vpandq AND_MASK, ACC0, ACC0
- vpandq AND_MASK, ACC1, ACC1
-
- valignq $7, A0, A1, A1
- valignq $7, ZERO, A0, A0
-
- vpaddq A0, ACC0, ACC0
- vpaddq A1, ACC1, ACC1
-
- vpcmpuq $6, AND_MASK, ACC0, %k1
- vpcmpuq $6, AND_MASK, ACC1, %k2
-
- kmovb %k1, %eax
- kmovb %k2, %ebx
-
- add %al, %al
- adc %bl, %bl
-
- vpcmpuq $0, AND_MASK, ACC0, %k1
- vpcmpuq $0, AND_MASK, ACC1, %k2
-
- kmovb %k1, %r8d
- kmovb %k2, %r9d
-
- add %r8b, %al
- adc %r9b, %bl
-
- xor %r8b, %al
- xor %r9b, %bl
-
- kmovb %eax, %k1
- kmovb %ebx, %k2
-
- vpsubq AND_MASK, ACC0, ACC0{%k1}
- vpsubq AND_MASK, ACC1, ACC1{%k2}
-
- vpandq AND_MASK, ACC0, ACC0
- vpandq AND_MASK, ACC1, ACC1
-
- mov $0x7f, t0
- kmovq t0, %k1
- 
- # Store the 15-limb result (8 + 7 qwords)
- vmovdqu64 ACC0, 64*0(res)
- vmovdqu64 ACC1, 64*1(res){%k1}
- bail:
- pop %r13
- pop %r12
- pop %rbx
- ret