#if defined(__APPLE__)
/* OS X's C ABI prefixes functions with underscore. */
#define C_ABI(x) _ ## x
#define HIDDEN .private_extern
#else
#define C_ABI(x) x
#define HIDDEN .hidden
#endif

#define ACC0  %zmm0
#define ACC1  %zmm1
#define ACC2  %zmm2
#define ACC3  %zmm3
#define ACC4  %zmm4
#define ACC5  %zmm5
#define ACC6  %zmm6
#define ACC7  %zmm7
#define ACC8  %zmm8
#define ACC9  %zmm9
#define ACC10 %zmm10
#define ACC11 %zmm11
#define ACC12 %zmm12
#define ACC13 %zmm13
#define ACC14 %zmm14
#define ACC15 %zmm15

#define A0  %zmm16
#define A1  %zmm17
#define A2  %zmm18
#define A3  %zmm19
#define A4  %zmm20
#define A5  %zmm21
#define A6  %zmm22
#define A7  %zmm23
#define A8  %zmm24
#define A9  %zmm25
#define A10 %zmm26
#define A11 %zmm27
#define A12 %zmm28
#define A13 %zmm29
#define A14 %zmm30

#define B %zmm31

#define rptr %rdi
#define aptr %rsi
#define bptr %rdx

#define r0ptr %rdi
#define a0ptr %rsi
#define b0ptr %rdx
#define r1ptr %rcx
#define a1ptr %r8
#define b1ptr %r9

#define hlp %rax

.p2align 6
.Lmask:
.Lpoly:
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff
.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480
.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0
.LpolyX:
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x049f878a8eeaff00
.quad 0x07cc76e3ec968500, 0x076da959b1a13f00, 0x084e9867d6ebe800, 0x0b5045cb25748000
.quad 0x0f97badc66856200, 0x041f71c0e1290900, 0x000000006fe5d500, 0
.Lperm0:
.quad 0,1,0,1,2,3,2,3
.Lperm1:
.quad 4,5,5,4,6,7,7,6
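
// Layout notes (added for clarity; our best reading of the code, not from the
// original source): each field element is stored as 15 radix-2^52 limbs, one
// limb per 64-bit word (the 16th register word is zeroed on load).  .Lpoly
// appears to hold the 751-bit modulus in this radix; its first quadword,
// 2^52-1, doubles as the limb mask used by the carry chains below (hence the
// .Lmask alias).  .LpolyX is the same set of limbs shifted left by 8 bits,
// i.e. 2^8 * p limb-by-limb, added ahead of a masked subtraction so no limb
// can go negative.  .Lperm0/.Lperm1 are qword index tables for vpermq during
// the vertical<->horizontal transposes.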

// TODO: avoid transposing every call by keeping data vertical throughout

// Performs two Fp2 multiplications (8 field multiplications in parallel)
.globl C_ABI(fp2_mul_ifma_x2)
C_ABI(fp2_mul_ifma_x2):

    push %rbp
    mov %rsp, %rbp
    sub $960, %rsp
    and $-64, %rsp

    mov $0x7f, %rax
    kmovq %rax, %k5

    // Load a0[0]
    vmovdqu64 0*64(a0ptr), %zmm0
    vmovdqu64 1*64(a0ptr), %zmm1{%k5}{z}
    lea 15*8(a0ptr), a0ptr
    // Load a0[1]
    vmovdqu64 0*64(a0ptr), %zmm2
    vmovdqu64 1*64(a0ptr), %zmm3{%k5}{z}

    // Load b0[0]
    vmovdqu64 0*64(b0ptr), %zmm4
    vmovdqu64 1*64(b0ptr), %zmm5{%k5}{z}
    lea 15*8(b0ptr), b0ptr
    // Load b0[1]
    vmovdqu64 0*64(b0ptr), %zmm6
    vmovdqu64 1*64(b0ptr), %zmm7{%k5}{z}

    // Load a1[0]
    vmovdqu64 0*64(a1ptr), %zmm8
    vmovdqu64 1*64(a1ptr), %zmm9{%k5}{z}
    lea 15*8(a1ptr), a1ptr
    // Load a1[1]
    vmovdqu64 0*64(a1ptr), %zmm10
    vmovdqu64 1*64(a1ptr), %zmm11{%k5}{z}

    // Load b1[0]
    vmovdqu64 0*64(b1ptr), %zmm12
    vmovdqu64 1*64(b1ptr), %zmm13{%k5}{z}
    lea 15*8(b1ptr), b1ptr
    // Load b1[1]
    vmovdqu64 0*64(b1ptr), %zmm14
    vmovdqu64 1*64(b1ptr), %zmm15{%k5}{z}

    // Transpose
    vpunpcklqdq %zmm2, %zmm0, %zmm16    // 0 0 2 2 4 4 6 6
    vpunpckhqdq %zmm2, %zmm0, %zmm17    // 1 1 3 3 5 5 7 7
    vpunpcklqdq %zmm6, %zmm4, %zmm18    // 0 0 2 2 4 4 6 6
    vpunpckhqdq %zmm6, %zmm4, %zmm19    // 1 1 3 3 5 5 7 7
    vpunpcklqdq %zmm10, %zmm8, %zmm20   // 0 0 2 2 4 4 6 6
    vpunpckhqdq %zmm10, %zmm8, %zmm21   // 1 1 3 3 5 5 7 7
    vpunpcklqdq %zmm14, %zmm12, %zmm22  // 0 0 2 2 4 4 6 6
    vpunpckhqdq %zmm14, %zmm12, %zmm23  // 1 1 3 3 5 5 7 7

    vpunpcklqdq %zmm3, %zmm1, %zmm24    // 8 8 10 10 12 12 14 14
    vpunpckhqdq %zmm3, %zmm1, %zmm25    // 9 9 11 11 13 13 15 15
    vpunpcklqdq %zmm7, %zmm5, %zmm26    // 8 8 10 10 12 12 14 14
    vpunpckhqdq %zmm7, %zmm5, %zmm27    // 9 9 11 11 13 13 15 15
    vpunpcklqdq %zmm11, %zmm9, %zmm28   // 8 8 10 10 12 12 14 14
    vpunpckhqdq %zmm11, %zmm9, %zmm29   // 9 9 11 11 13 13 15 15
    vpunpcklqdq %zmm15, %zmm13, %zmm30  // 8 8 10 10 12 12 14 14
    vpunpckhqdq %zmm15, %zmm13, %zmm31  // 9 9 11 11 13 13 15 15

    vshufi64x2 $0x44, %zmm20, %zmm16, %zmm0   // 0 0 2 2 0 0 2 2
    vshufi64x2 $0x44, %zmm22, %zmm18, %zmm1   // 0 0 2 2 0 0 2 2
    vshufi64x2 $0xee, %zmm20, %zmm16, %zmm2   // 4 4 6 6 4 4 6 6
    vshufi64x2 $0xee, %zmm22, %zmm18, %zmm3   // 4 4 6 6 4 4 6 6
    vshufi64x2 $0x44, %zmm21, %zmm17, %zmm4   // 1 1 3 3 1 1 3 3
    vshufi64x2 $0x44, %zmm23, %zmm19, %zmm5   // 1 1 3 3 1 1 3 3
    vshufi64x2 $0xee, %zmm21, %zmm17, %zmm6   // 5 5 7 7 5 5 7 7
    vshufi64x2 $0xee, %zmm23, %zmm19, %zmm7   // 5 5 7 7 5 5 7 7

    vshufi64x2 $0x44, %zmm28, %zmm24, %zmm8   // 8 8 10 10 8 8 10 10
    vshufi64x2 $0x44, %zmm30, %zmm26, %zmm9   // 8 8 10 10 8 8 10 10
    vshufi64x2 $0xee, %zmm28, %zmm24, %zmm10  // 12 12 14 14 12 12 14 14
    vshufi64x2 $0xee, %zmm30, %zmm26, %zmm11  // 12 12 14 14 12 12 14 14
    vshufi64x2 $0x44, %zmm29, %zmm25, %zmm12  // 9 9 11 11 9 9 11 11
    vshufi64x2 $0x44, %zmm31, %zmm27, %zmm13  // 9 9 11 11 9 9 11 11
    vshufi64x2 $0xee, %zmm29, %zmm25, %zmm14  // 13 13 15 15 13 13 15 15
    vshufi64x2 $0xee, %zmm31, %zmm27, %zmm15  // 13 13 15 15 13 13 15 15

    vshufi64x2 $0x88, %zmm1, %zmm0, %zmm16    //0
    vshufi64x2 $0x88, %zmm5, %zmm4, %zmm17    //1
    vshufi64x2 $0xdd, %zmm1, %zmm0, %zmm18    //2
    vshufi64x2 $0xdd, %zmm5, %zmm4, %zmm19
    vshufi64x2 $0x88, %zmm3, %zmm2, %zmm20
    vshufi64x2 $0x88, %zmm7, %zmm6, %zmm21
    vshufi64x2 $0xdd, %zmm3, %zmm2, %zmm22
    vshufi64x2 $0xdd, %zmm7, %zmm6, %zmm23
    vshufi64x2 $0x88, %zmm9, %zmm8, %zmm24
    vshufi64x2 $0x88, %zmm13, %zmm12, %zmm25
    vshufi64x2 $0xdd, %zmm9, %zmm8, %zmm26
    vshufi64x2 $0xdd, %zmm13, %zmm12, %zmm27
    vshufi64x2 $0x88, %zmm11, %zmm10, %zmm28
    vshufi64x2 $0x88, %zmm15, %zmm14, %zmm29
    vshufi64x2 $0xdd, %zmm11, %zmm10, %zmm30

    vmovdqa64 .Lperm0(%rip), %zmm31
    vpermq %zmm16, %zmm31, %zmm0
    vpermq %zmm17, %zmm31, %zmm1
    vpermq %zmm18, %zmm31, %zmm2
    vpermq %zmm19, %zmm31, %zmm3
    vpermq %zmm20, %zmm31, %zmm4
    vpermq %zmm21, %zmm31, %zmm5
    vpermq %zmm22, %zmm31, %zmm6
    vpermq %zmm23, %zmm31, %zmm7
    vpermq %zmm24, %zmm31, %zmm8
    vpermq %zmm25, %zmm31, %zmm9
    vpermq %zmm26, %zmm31, %zmm10
    vpermq %zmm27, %zmm31, %zmm11
    vpermq %zmm28, %zmm31, %zmm12
    vpermq %zmm29, %zmm31, %zmm13
    vpermq %zmm30, %zmm31, %zmm14

    .irp r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
    vmovdqu64 %zmm\r, \r*64(%rsp)
    .endr

    vmovdqa64 .Lperm1(%rip), %zmm31
    vpermq %zmm16, %zmm31, A0
    vpermq %zmm17, %zmm31, A1
    vpermq %zmm18, %zmm31, A2
    vpermq %zmm19, %zmm31, A3
    vpermq %zmm20, %zmm31, A4
    vpermq %zmm21, %zmm31, A5
    vpermq %zmm22, %zmm31, A6
    vpermq %zmm23, %zmm31, A7
    vpermq %zmm24, %zmm31, A8
    vpermq %zmm25, %zmm31, A9
    vpermq %zmm26, %zmm31, A10
    vpermq %zmm27, %zmm31, A11
    vpermq %zmm28, %zmm31, A12
    vpermq %zmm29, %zmm31, A13
    vpermq %zmm30, %zmm31, A14

    lea (%rsp), bptr
    call do_mul_x2

    // After parallel multiplication the layout is:
    // A0[0] * B0[0], A0[1] * B0[1], A0[0] * B0[1], A0[1] * B0[0], A1[0] * B1[0], A1[1] * B1[1], A1[0] * B1[1], A1[1] * B1[0]
    // We need to compute:
    // A0[0] * B0[0] - A0[1] * B0[1], A0[0] * B0[1] + A0[1] * B0[0], A1[0] * B1[0] - A1[1] * B1[1], A1[0] * B1[1] + A1[1] * B1[0]
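    // Added sketch (not part of the original source): per operand pair the
    // lanes above hold the four cross products, and the masked adds/subs
    // below combine them into one Fp2 product, i.e. roughly the following C,
    // where mod_add/mod_sub/mod_mul are hypothetical helpers on 15-limb values:
    //
    //     c[0] = mod_sub(mod_mul(a[0], b[0]), mod_mul(a[1], b[1]));  /* real part */
    //     c[1] = mod_add(mod_mul(a[0], b[1]), mod_mul(a[1], b[0]));  /* imag part */
    //
    // Mask 0x44 selects lanes 2 and 6 for the additions; mask 0x11 selects
    // lanes 0 and 4 for the subtractions, where 2^8*p (.LpolyX) is added first
    // so no limb of the difference can go negative.  The value stays in a
    // redundant form and is normalized by the carry chain that follows.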
    vpsrldq $8, ACC0, A0
    vpsrldq $8, ACC1, A1
    vpsrldq $8, ACC2, A2
    vpsrldq $8, ACC3, A3
    vpsrldq $8, ACC4, A4
    vpsrldq $8, ACC5, A5
    vpsrldq $8, ACC6, A6
    vpsrldq $8, ACC7, A7
    vpsrldq $8, ACC8, A8
    vpsrldq $8, ACC9, A9
    vpsrldq $8, ACC10, A10
    vpsrldq $8, ACC11, A11
    vpsrldq $8, ACC12, A12
    vpsrldq $8, ACC13, A13
    vpsrldq $8, ACC14, A14

    mov $0x44, hlp
    kmovq hlp, %k7

    vpaddq A0, ACC0, ACC0{%k7}
    vpaddq A1, ACC1, ACC1{%k7}
    vpaddq A2, ACC2, ACC2{%k7}
    vpaddq A3, ACC3, ACC3{%k7}
    vpaddq A4, ACC4, ACC4{%k7}
    vpaddq A5, ACC5, ACC5{%k7}
    vpaddq A6, ACC6, ACC6{%k7}
    vpaddq A7, ACC7, ACC7{%k7}
    vpaddq A8, ACC8, ACC8{%k7}
    vpaddq A9, ACC9, ACC9{%k7}
    vpaddq A10, ACC10, ACC10{%k7}
    vpaddq A11, ACC11, ACC11{%k7}
    vpaddq A12, ACC12, ACC12{%k7}
    vpaddq A13, ACC13, ACC13{%k7}
    vpaddq A14, ACC14, ACC14{%k7}

    mov $0x11, hlp
    kmovq hlp, %k7

    vpaddq 0*8+.LpolyX(%rip){1to8}, ACC0, ACC0{%k7}
    vpaddq 1*8+.LpolyX(%rip){1to8}, ACC1, ACC1{%k7}
    vpaddq 2*8+.LpolyX(%rip){1to8}, ACC2, ACC2{%k7}
    vpaddq 3*8+.LpolyX(%rip){1to8}, ACC3, ACC3{%k7}
    vpaddq 4*8+.LpolyX(%rip){1to8}, ACC4, ACC4{%k7}
    vpaddq 5*8+.LpolyX(%rip){1to8}, ACC5, ACC5{%k7}
    vpaddq 6*8+.LpolyX(%rip){1to8}, ACC6, ACC6{%k7}
    vpaddq 7*8+.LpolyX(%rip){1to8}, ACC7, ACC7{%k7}
    vpaddq 8*8+.LpolyX(%rip){1to8}, ACC8, ACC8{%k7}
    vpaddq 9*8+.LpolyX(%rip){1to8}, ACC9, ACC9{%k7}
    vpaddq 10*8+.LpolyX(%rip){1to8}, ACC10, ACC10{%k7}
    vpaddq 11*8+.LpolyX(%rip){1to8}, ACC11, ACC11{%k7}
    vpaddq 12*8+.LpolyX(%rip){1to8}, ACC12, ACC12{%k7}
    vpaddq 13*8+.LpolyX(%rip){1to8}, ACC13, ACC13{%k7}
    vpaddq 14*8+.LpolyX(%rip){1to8}, ACC14, ACC14{%k7}

    vpsubq A0, ACC0, ACC0{%k7}
    vpsubq A1, ACC1, ACC1{%k7}
    vpsubq A2, ACC2, ACC2{%k7}
    vpsubq A3, ACC3, ACC3{%k7}
    vpsubq A4, ACC4, ACC4{%k7}
    vpsubq A5, ACC5, ACC5{%k7}
    vpsubq A6, ACC6, ACC6{%k7}
    vpsubq A7, ACC7, ACC7{%k7}
    vpsubq A8, ACC8, ACC8{%k7}
    vpsubq A9, ACC9, ACC9{%k7}
    vpsubq A10, ACC10, ACC10{%k7}
    vpsubq A11, ACC11, ACC11{%k7}
    vpsubq A12, ACC12, ACC12{%k7}
    vpsubq A13, ACC13, ACC13{%k7}
    vpsubq A14, ACC14, ACC14{%k7}

    vpsrlq $52, ACC0, B
    vpaddq B, ACC1, ACC1
    vpandq .Lpoly(%rip){1to8}, ACC0, ACC0

    vpsrlq $52, ACC1, B
    vpaddq B, ACC2, ACC2
    vpandq .Lpoly(%rip){1to8}, ACC1, ACC1

    vpsrlq $52, ACC2, B
    vpaddq B, ACC3, ACC3
    vpandq .Lpoly(%rip){1to8}, ACC2, ACC2

    vpsrlq $52, ACC3, B
    vpaddq B, ACC4, ACC4
    vpandq .Lpoly(%rip){1to8}, ACC3, ACC3

    vpsrlq $52, ACC4, B
    vpaddq B, ACC5, ACC5
    vpandq .Lpoly(%rip){1to8}, ACC4, ACC4

    vpsrlq $52, ACC5, B
    vpaddq B, ACC6, ACC6
    vpandq .Lpoly(%rip){1to8}, ACC5, ACC5

    vpsrlq $52, ACC6, B
    vpaddq B, ACC7, ACC7
    vpandq .Lpoly(%rip){1to8}, ACC6, ACC6

    vpsrlq $52, ACC7, B
    vpaddq B, ACC8, ACC8
    vpandq .Lpoly(%rip){1to8}, ACC7, ACC7

    vpsrlq $52, ACC8, B
    vpaddq B, ACC9, ACC9
    vpandq .Lpoly(%rip){1to8}, ACC8, ACC8

    vpsrlq $52, ACC9, B
    vpaddq B, ACC10, ACC10
    vpandq .Lpoly(%rip){1to8}, ACC9, ACC9

    vpsrlq $52, ACC10, B
    vpaddq B, ACC11, ACC11
    vpandq .Lpoly(%rip){1to8}, ACC10, ACC10

    vpsrlq $52, ACC11, B
    vpaddq B, ACC12, ACC12
    vpandq .Lpoly(%rip){1to8}, ACC11, ACC11

    vpsrlq $52, ACC12, B
    vpaddq B, ACC13, ACC13
    vpandq .Lpoly(%rip){1to8}, ACC12, ACC12

    vpsrlq $52, ACC13, B
    vpaddq B, ACC14, ACC14
    vpandq .Lpoly(%rip){1to8}, ACC13, ACC13

    vpandq .Lpoly(%rip){1to8}, ACC14, ACC14

    // Transpose to horizontal
    vpunpcklqdq ACC1, ACC0, ACC0
    vpunpcklqdq ACC3, ACC2, ACC1
    vpunpcklqdq ACC5, ACC4, ACC2
    vpunpcklqdq ACC7, ACC6, ACC3
    vpunpcklqdq ACC9, ACC8, ACC4
    vpunpcklqdq ACC11, ACC10, ACC5
    vpunpcklqdq ACC13, ACC12, ACC6
    vmovdqa64 ACC14, ACC7

    vshufi64x2 $0x44, ACC1, ACC0, A0
    vshufi64x2 $0x44, ACC3, ACC2, A1
    vshufi64x2 $0x44, ACC5, ACC4, A2
    vshufi64x2 $0x44, ACC7, ACC6, A3
    vshufi64x2 $0xee, ACC1, ACC0, A4
    vshufi64x2 $0xee, ACC3, ACC2, A5
    vshufi64x2 $0xee, ACC5, ACC4, A6
    vshufi64x2 $0xee, ACC7, ACC6, A7

    vshufi64x2 $0x88, A1, A0, ACC0
    vshufi64x2 $0x88, A3, A2, ACC1
    vshufi64x2 $0xdd, A1, A0, ACC2
    vshufi64x2 $0xdd, A3, A2, ACC3
    vshufi64x2 $0x88, A5, A4, ACC4
    vshufi64x2 $0x88, A7, A6, ACC5
    vshufi64x2 $0xdd, A5, A4, ACC6
    vshufi64x2 $0xdd, A7, A6, ACC7

    vmovdqu64 ACC0, 0*64(r0ptr)
    vmovdqu64 ACC1, 1*64(r0ptr){%k5}
    lea 15*8(r0ptr), r0ptr
    vmovdqu64 ACC2, 0*64(r0ptr)
    vmovdqu64 ACC3, 1*64(r0ptr){%k5}

    vmovdqu64 ACC4, 0*64(r1ptr)
    vmovdqu64 ACC5, 1*64(r1ptr){%k5}
    lea 15*8(r1ptr), r1ptr
    vmovdqu64 ACC6, 0*64(r1ptr)
    vmovdqu64 ACC7, 1*64(r1ptr){%k5}

    mov %rbp, %rsp
    pop %rbp
    ret
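
// Reference model (added; not part of the original source): each 64-bit lane
// of the do_mul_x2 loop below behaves roughly like a word-by-word Montgomery
// multiplication in radix 2^52 with 15 limbs.  NLIMBS, mul_lo52(), mul_hi52()
// and p[] are hypothetical names used only for this sketch:
//
//     for (int i = 0; i < NLIMBS; i++) {                 /* 15 iterations      */
//         for (int j = 0; j < NLIMBS; j++) {             /* vpmadd52luq / huq  */
//             acc[j]     += mul_lo52(a[j], b[i]);
//             acc[j + 1] += mul_hi52(a[j], b[i]);
//         }
//         uint64_t m = acc[0] & ((1ULL << 52) - 1);      /* Montgomery quotient */
//         for (int j = 0; j < NLIMBS; j++) {
//             acc[j]     += mul_lo52(p[j], m);
//             acc[j + 1] += mul_hi52(p[j], m);
//         }
//         acc[1] += acc[0] >> 52;                        /* acc[0] mod 2^52 is now 0 */
//         for (int j = 0; j < NLIMBS; j++)               /* drop limb 0, shift down  */
//             acc[j] = acc[j + 1];
//         acc[NLIMBS] = 0;
//     }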

// Performs 8 field multiplications in parallel
.globl C_ABI(amm_751_ifma_x2)
C_ABI(amm_751_ifma_x2):

    vmovdqu64 0*64(aptr), A0
    vmovdqu64 1*64(aptr), A1
    vmovdqu64 2*64(aptr), A2
    vmovdqu64 3*64(aptr), A3
    vmovdqu64 4*64(aptr), A4
    vmovdqu64 5*64(aptr), A5
    vmovdqu64 6*64(aptr), A6
    vmovdqu64 7*64(aptr), A7
    vmovdqu64 8*64(aptr), A8
    vmovdqu64 9*64(aptr), A9
    vmovdqu64 10*64(aptr), A10
    vmovdqu64 11*64(aptr), A11
    vmovdqu64 12*64(aptr), A12
    vmovdqu64 13*64(aptr), A13
    vmovdqu64 14*64(aptr), A14

do_mul_x2:

    vpxorq ACC0, ACC0, ACC0
    vpxorq ACC1, ACC1, ACC1
    vpxorq ACC2, ACC2, ACC2
    vpxorq ACC3, ACC3, ACC3
    vpxorq ACC4, ACC4, ACC4
    vpxorq ACC5, ACC5, ACC5
    vpxorq ACC6, ACC6, ACC6
    vpxorq ACC7, ACC7, ACC7
    vpxorq ACC8, ACC8, ACC8
    vpxorq ACC9, ACC9, ACC9
    vpxorq ACC10, ACC10, ACC10
    vpxorq ACC11, ACC11, ACC11
    vpxorq ACC12, ACC12, ACC12
    vpxorq ACC13, ACC13, ACC13
    vpxorq ACC14, ACC14, ACC14
    vpxorq ACC15, ACC15, ACC15

    mov $15, hlp
1:
    vmovdqu64 (bptr), B
    lea 1*64(bptr), bptr

    vpmadd52luq A0, B, ACC0
    vpmadd52luq A1, B, ACC1
    vpmadd52luq A2, B, ACC2
    vpmadd52luq A3, B, ACC3
    vpmadd52luq A4, B, ACC4
    vpmadd52luq A5, B, ACC5
    vpmadd52luq A6, B, ACC6
    vpmadd52luq A7, B, ACC7
    vpmadd52luq A8, B, ACC8
    vpmadd52luq A9, B, ACC9
    vpmadd52luq A10, B, ACC10
    vpmadd52luq A11, B, ACC11
    vpmadd52luq A12, B, ACC12
    vpmadd52luq A13, B, ACC13
    vpmadd52luq A14, B, ACC14

    vpmadd52huq A0, B, ACC1
    vpmadd52huq A1, B, ACC2
    vpmadd52huq A2, B, ACC3
    vpmadd52huq A3, B, ACC4
    vpmadd52huq A4, B, ACC5
    vpmadd52huq A5, B, ACC6
    vpmadd52huq A6, B, ACC7
    vpmadd52huq A7, B, ACC8
    vpmadd52huq A8, B, ACC9
    vpmadd52huq A9, B, ACC10
    vpmadd52huq A10, B, ACC11
    vpmadd52huq A11, B, ACC12
    vpmadd52huq A12, B, ACC13
    vpmadd52huq A13, B, ACC14
    vpmadd52huq A14, B, ACC15
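    // Reduction step (comment added for clarity): B picks up the current low
    // accumulator limb.  Since the low limb of .Lpoly is 2^52 - 1, i.e.
    // p = -1 mod 2^52, the Montgomery quotient for this iteration is simply
    // ACC0 mod 2^52 (vpmadd52* only reads the low 52 bits of each source),
    // so no separate multiplication by -p^(-1) mod 2^52 is needed.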
    vmovdqa64 ACC0, B

    vpmadd52luq 0*8 + .Lpoly(%rip){1to8}, B, ACC0
    vpsrlq $52, ACC0, ACC0

    vpmadd52luq 1*8 + .Lpoly(%rip){1to8}, B, ACC1
    vpaddq ACC1, ACC0, ACC0

    vpmadd52luq 2*8 + .Lpoly(%rip){1to8}, B, ACC2
    vmovdqa64 ACC2, ACC1
    vpmadd52luq 3*8 + .Lpoly(%rip){1to8}, B, ACC3
    vmovdqa64 ACC3, ACC2
    vpmadd52luq 4*8 + .Lpoly(%rip){1to8}, B, ACC4
    vmovdqa64 ACC4, ACC3
    vpmadd52luq 5*8 + .Lpoly(%rip){1to8}, B, ACC5
    vmovdqa64 ACC5, ACC4
    vpmadd52luq 6*8 + .Lpoly(%rip){1to8}, B, ACC6
    vmovdqa64 ACC6, ACC5
    vpmadd52luq 7*8 + .Lpoly(%rip){1to8}, B, ACC7
    vmovdqa64 ACC7, ACC6
    vpmadd52luq 8*8 + .Lpoly(%rip){1to8}, B, ACC8
    vmovdqa64 ACC8, ACC7
    vpmadd52luq 9*8 + .Lpoly(%rip){1to8}, B, ACC9
    vmovdqa64 ACC9, ACC8
    vpmadd52luq 10*8 + .Lpoly(%rip){1to8}, B, ACC10
    vmovdqa64 ACC10, ACC9
    vpmadd52luq 11*8 + .Lpoly(%rip){1to8}, B, ACC11
    vmovdqa64 ACC11, ACC10
    vpmadd52luq 12*8 + .Lpoly(%rip){1to8}, B, ACC12
    vmovdqa64 ACC12, ACC11
    vpmadd52luq 13*8 + .Lpoly(%rip){1to8}, B, ACC13
    vmovdqa64 ACC13, ACC12
    vpmadd52luq 14*8 + .Lpoly(%rip){1to8}, B, ACC14
    vmovdqa64 ACC14, ACC13
    vmovdqa64 ACC15, ACC14
    vpxorq ACC15, ACC15, ACC15

    vpmadd52huq 0*8 + .Lpoly(%rip){1to8}, B, ACC0
    vpmadd52huq 1*8 + .Lpoly(%rip){1to8}, B, ACC1
    vpmadd52huq 2*8 + .Lpoly(%rip){1to8}, B, ACC2
    vpmadd52huq 3*8 + .Lpoly(%rip){1to8}, B, ACC3
    vpmadd52huq 4*8 + .Lpoly(%rip){1to8}, B, ACC4
    vpmadd52huq 5*8 + .Lpoly(%rip){1to8}, B, ACC5
    vpmadd52huq 6*8 + .Lpoly(%rip){1to8}, B, ACC6
    vpmadd52huq 7*8 + .Lpoly(%rip){1to8}, B, ACC7
    vpmadd52huq 8*8 + .Lpoly(%rip){1to8}, B, ACC8
    vpmadd52huq 9*8 + .Lpoly(%rip){1to8}, B, ACC9
    vpmadd52huq 10*8 + .Lpoly(%rip){1to8}, B, ACC10
    vpmadd52huq 11*8 + .Lpoly(%rip){1to8}, B, ACC11
    vpmadd52huq 12*8 + .Lpoly(%rip){1to8}, B, ACC12
    vpmadd52huq 13*8 + .Lpoly(%rip){1to8}, B, ACC13
    vpmadd52huq 14*8 + .Lpoly(%rip){1to8}, B, ACC14

    dec hlp
    jnz 1b

    ret