#if defined(__APPLE__)
/* OS X's C ABI prefixes functions with underscore. */
#define C_ABI(x) _ ## x
#define HIDDEN .private_extern
#else
#define C_ABI(x) x
#define HIDDEN .hidden
#endif
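// Register roles:
//   ACC0-ACC15 - 8-lane accumulator window for the product and its reduction
//   A0-A14     - the 15 radix-2^52 limbs of the multiplicand, one vector per limb
//   B          - the current vector of multiplier limbs
//   hlp        - scratch / loop counter
// The pointer aliases below name the System V argument registers.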
#define ACC0 %zmm0
#define ACC1 %zmm1
#define ACC2 %zmm2
#define ACC3 %zmm3
#define ACC4 %zmm4
#define ACC5 %zmm5
#define ACC6 %zmm6
#define ACC7 %zmm7
#define ACC8 %zmm8
#define ACC9 %zmm9
#define ACC10 %zmm10
#define ACC11 %zmm11
#define ACC12 %zmm12
#define ACC13 %zmm13
#define ACC14 %zmm14
#define ACC15 %zmm15
#define A0 %zmm16
#define A1 %zmm17
#define A2 %zmm18
#define A3 %zmm19
#define A4 %zmm20
#define A5 %zmm21
#define A6 %zmm22
#define A7 %zmm23
#define A8 %zmm24
#define A9 %zmm25
#define A10 %zmm26
#define A11 %zmm27
#define A12 %zmm28
#define A13 %zmm29
#define A14 %zmm30
#define B %zmm31
#define rptr %rdi
#define aptr %rsi
#define bptr %rdx
#define r0ptr %rdi
#define a0ptr %rsi
#define b0ptr %rdx
#define r1ptr %rcx
#define a1ptr %r8
#define b1ptr %r9
#define hlp %rax
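// Constant data (all limbs are radix-2^52):
//   .Lpoly  - the 15 limbs of the 751-bit modulus; its first limb, 2^52-1,
//             also serves as the 52-bit lane mask (the aliased .Lmask label)
//   .LpolyX - the same limbs multiplied by 2^8, added before the masked
//             subtractions below so they cannot underflow
//   .Lperm0/.Lperm1 - qword permutation patterns used to arrange the Fp2
//             operands for lane-wise multiplication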
.p2align 6
.Lmask:
.Lpoly:
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff
.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480
.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0
.LpolyX:
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x049f878a8eeaff00
.quad 0x07cc76e3ec968500, 0x076da959b1a13f00, 0x084e9867d6ebe800, 0x0b5045cb25748000
.quad 0x0f97badc66856200, 0x041f71c0e1290900, 0x000000006fe5d500, 0
.Lperm0:
.quad 0,1,0,1,2,3,2,3
.Lperm1:
.quad 4,5,5,4,6,7,7,6
// TODO: avoid transposing every call by keeping data vertical throughout
// Computes two Fp2 multiplications (r0 = a0*b0, r1 = a1*b1) by running their
// eight Fp sub-multiplications in parallel
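// Arguments (System V ABI): r0ptr/a0ptr/b0ptr in %rdi/%rsi/%rdx,
// r1ptr/a1ptr/b1ptr in %rcx/%r8/%r9. Each Fp2 element is two Fp elements
// stored back to back; each Fp element is 15 64-bit limbs (radix 2^52),
// so the second component starts 15*8 bytes after the first.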
.globl C_ABI(fp2_mul_ifma_x2)
C_ABI(fp2_mul_ifma_x2):
push %rbp
mov %rsp, %rbp
sub $960, %rsp
and $-64, %rsp
mov $0x7f, %rax
kmovq %rax, %k5
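// 960 bytes of 64-byte-aligned scratch hold the 15 spilled operand vectors.
// k5 = 0x7f selects seven qwords: each 15-limb Fp element is loaded as one
// full 64-byte vector plus a masked 7-qword tail.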
// Load a0[0]
vmovdqu64 0*64(a0ptr), %zmm0
vmovdqu64 1*64(a0ptr), %zmm1{%k5}{z}
lea 15*8(a0ptr), a0ptr
// Load a0[1]
vmovdqu64 0*64(a0ptr), %zmm2
vmovdqu64 1*64(a0ptr), %zmm3{%k5}{z}
// Load b0[0]
vmovdqu64 0*64(b0ptr), %zmm4
vmovdqu64 1*64(b0ptr), %zmm5{%k5}{z}
lea 15*8(b0ptr), b0ptr
// Load b0[1]
vmovdqu64 0*64(b0ptr), %zmm6
vmovdqu64 1*64(b0ptr), %zmm7{%k5}{z}
// Load a1[0]
vmovdqu64 0*64(a1ptr), %zmm8
vmovdqu64 1*64(a1ptr), %zmm9{%k5}{z}
lea 15*8(a1ptr), a1ptr
// Load a1[1]
vmovdqu64 0*64(a1ptr), %zmm10
vmovdqu64 1*64(a1ptr), %zmm11{%k5}{z}
// Load b1[0]
vmovdqu64 0*64(b1ptr), %zmm12
vmovdqu64 1*64(b1ptr), %zmm13{%k5}{z}
lea 15*8(b1ptr), b1ptr
// Load b1[1]
vmovdqu64 0*64(b1ptr), %zmm14
vmovdqu64 1*64(b1ptr), %zmm15{%k5}{z}
// Transpose
vpunpcklqdq %zmm2, %zmm0, %zmm16 // 0 0 2 2 4 4 6 6
vpunpckhqdq %zmm2, %zmm0, %zmm17 // 1 1 3 3 5 5 7 7
vpunpcklqdq %zmm6, %zmm4, %zmm18 // 0 0 2 2 4 4 6 6
vpunpckhqdq %zmm6, %zmm4, %zmm19 // 1 1 3 3 5 5 7 7
vpunpcklqdq %zmm10, %zmm8, %zmm20 // 0 0 2 2 4 4 6 6
vpunpckhqdq %zmm10, %zmm8, %zmm21 // 1 1 3 3 5 5 7 7
vpunpcklqdq %zmm14, %zmm12, %zmm22 // 0 0 2 2 4 4 6 6
vpunpckhqdq %zmm14, %zmm12, %zmm23 // 1 1 3 3 5 5 7 7
vpunpcklqdq %zmm3, %zmm1, %zmm24 // 8 8 10 10 12 12 14 14
vpunpckhqdq %zmm3, %zmm1, %zmm25 // 9 9 11 11 13 13 15 15
vpunpcklqdq %zmm7, %zmm5, %zmm26 // 8 8 10 10 12 12 14 14
vpunpckhqdq %zmm7, %zmm5, %zmm27 // 9 9 11 11 13 13 15 15
vpunpcklqdq %zmm11, %zmm9, %zmm28 // 8 8 10 10 12 12 14 14
vpunpckhqdq %zmm11, %zmm9, %zmm29 // 9 9 11 11 13 13 15 15
vpunpcklqdq %zmm15, %zmm13, %zmm30 // 8 8 10 10 12 12 14 14
vpunpckhqdq %zmm15, %zmm13, %zmm31 // 9 9 11 11 13 13 15 15
vshufi64x2 $0x44, %zmm20, %zmm16, %zmm0 // 0 0 2 2 0 0 2 2
vshufi64x2 $0x44, %zmm22, %zmm18, %zmm1 // 0 0 2 2 0 0 2 2
vshufi64x2 $0xee, %zmm20, %zmm16, %zmm2 // 4 4 6 6 4 4 6 6
vshufi64x2 $0xee, %zmm22, %zmm18, %zmm3 // 4 4 6 6 4 4 6 6
vshufi64x2 $0x44, %zmm21, %zmm17, %zmm4 // 1 1 3 3 1 1 3 3
vshufi64x2 $0x44, %zmm23, %zmm19, %zmm5 // 1 1 3 3 1 1 3 3
vshufi64x2 $0xee, %zmm21, %zmm17, %zmm6 // 5 5 7 7 5 5 7 7
vshufi64x2 $0xee, %zmm23, %zmm19, %zmm7 // 5 5 7 7 5 5 7 7
vshufi64x2 $0x44, %zmm28, %zmm24, %zmm8 // 8 8 10 10 8 8 10 10
vshufi64x2 $0x44, %zmm30, %zmm26, %zmm9 // 8 8 10 10 8 8 10 10
vshufi64x2 $0xee, %zmm28, %zmm24, %zmm10 // 12 12 14 14 12 12 14 14
vshufi64x2 $0xee, %zmm30, %zmm26, %zmm11 // 12 12 14 14 12 12 14 14
vshufi64x2 $0x44, %zmm29, %zmm25, %zmm12 // 9 9 11 11 9 9 11 11
vshufi64x2 $0x44, %zmm31, %zmm27, %zmm13 // 9 9 11 11 9 9 11 11
vshufi64x2 $0xee, %zmm29, %zmm25, %zmm14 // 13 13 15 15 13 13 15 15
vshufi64x2 $0xee, %zmm31, %zmm27, %zmm15 // 13 13 15 15 13 13 15 15
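// zmm16..zmm30 now hold limbs 0..14; within each, the eight lanes are
// (a0[0], a0[1], a1[0], a1[1], b0[0], b0[1], b1[0], b1[1])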
vshufi64x2 $0x88, %zmm1, %zmm0, %zmm16 //0
vshufi64x2 $0x88, %zmm5, %zmm4, %zmm17 //1
vshufi64x2 $0xdd, %zmm1, %zmm0, %zmm18 //2
vshufi64x2 $0xdd, %zmm5, %zmm4, %zmm19
vshufi64x2 $0x88, %zmm3, %zmm2, %zmm20
vshufi64x2 $0x88, %zmm7, %zmm6, %zmm21
vshufi64x2 $0xdd, %zmm3, %zmm2, %zmm22
vshufi64x2 $0xdd, %zmm7, %zmm6, %zmm23
vshufi64x2 $0x88, %zmm9, %zmm8, %zmm24
vshufi64x2 $0x88, %zmm13, %zmm12, %zmm25
vshufi64x2 $0xdd, %zmm9, %zmm8, %zmm26
vshufi64x2 $0xdd, %zmm13, %zmm12, %zmm27
vshufi64x2 $0x88, %zmm11, %zmm10, %zmm28
vshufi64x2 $0x88, %zmm15, %zmm14, %zmm29
vshufi64x2 $0xdd, %zmm11, %zmm10, %zmm30
vmovdqa64 .Lperm0(%rip), %zmm31
vpermq %zmm16, %zmm31, %zmm0
vpermq %zmm17, %zmm31, %zmm1
vpermq %zmm18, %zmm31, %zmm2
vpermq %zmm19, %zmm31, %zmm3
vpermq %zmm20, %zmm31, %zmm4
vpermq %zmm21, %zmm31, %zmm5
vpermq %zmm22, %zmm31, %zmm6
vpermq %zmm23, %zmm31, %zmm7
vpermq %zmm24, %zmm31, %zmm8
vpermq %zmm25, %zmm31, %zmm9
vpermq %zmm26, %zmm31, %zmm10
vpermq %zmm27, %zmm31, %zmm11
vpermq %zmm28, %zmm31, %zmm12
vpermq %zmm29, %zmm31, %zmm13
vpermq %zmm30, %zmm31, %zmm14
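// Spill the .Lperm0 arrangement (per limb, lanes are
// a0[0],a0[1],a0[0],a0[1],a1[0],a1[1],a1[0],a1[1]) to the scratch area;
// it is read back as the b operand of do_mul_x2.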
.irp r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
vmovdqu64 %zmm\r, \r*64(%rsp)
.endr
vmovdqa64 .Lperm1(%rip), %zmm31
vpermq %zmm16, %zmm31, A0
vpermq %zmm17, %zmm31, A1
vpermq %zmm18, %zmm31, A2
vpermq %zmm19, %zmm31, A3
vpermq %zmm20, %zmm31, A4
vpermq %zmm21, %zmm31, A5
vpermq %zmm22, %zmm31, A6
vpermq %zmm23, %zmm31, A7
vpermq %zmm24, %zmm31, A8
vpermq %zmm25, %zmm31, A9
vpermq %zmm26, %zmm31, A10
vpermq %zmm27, %zmm31, A11
vpermq %zmm28, %zmm31, A12
vpermq %zmm29, %zmm31, A13
vpermq %zmm30, %zmm31, A14
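// Keep the .Lperm1 arrangement in A0..A14 (per limb, lanes are
// b0[0],b0[1],b0[1],b0[0],b1[0],b1[1],b1[1],b1[0]); the lane-wise products
// of the two arrangements are exactly the eight sub-products listed below.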
lea (%rsp), bptr
call do_mul_x2
// After parallel multiplication the layout is:
// A0[0] * B0[0], A0[1] * B0[1], A0[0] * B0[1], A0[1] * B0[0], A1[0] * B1[0], A1[1] * B1[1], A1[0] * B1[1], A1[1] * B1[0]
// We need to compute:
// A0[0] * B0[0] - A0[1] * B0[1], A0[0] * B0[1] + A0[1] * B0[0], A1[0] * B1[0] - A1[1] * B1[1], A1[0] * B1[1] + A1[1] * B1[0]
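// vpsrldq shifts each odd lane down onto its even neighbour; mask 0x44 then
// adds it into lanes 2 and 6 (the imaginary parts), while mask 0x11 subtracts
// it from lanes 0 and 4 (the real parts) after first adding 2^8*p from
// .LpolyX so the subtraction cannot underflow.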
vpsrldq $8, ACC0, A0
vpsrldq $8, ACC1, A1
vpsrldq $8, ACC2, A2
vpsrldq $8, ACC3, A3
vpsrldq $8, ACC4, A4
vpsrldq $8, ACC5, A5
vpsrldq $8, ACC6, A6
vpsrldq $8, ACC7, A7
vpsrldq $8, ACC8, A8
vpsrldq $8, ACC9, A9
vpsrldq $8, ACC10, A10
vpsrldq $8, ACC11, A11
vpsrldq $8, ACC12, A12
vpsrldq $8, ACC13, A13
vpsrldq $8, ACC14, A14
mov $0x44, hlp
kmovq hlp, %k7
vpaddq A0, ACC0, ACC0{%k7}
vpaddq A1, ACC1, ACC1{%k7}
vpaddq A2, ACC2, ACC2{%k7}
vpaddq A3, ACC3, ACC3{%k7}
vpaddq A4, ACC4, ACC4{%k7}
vpaddq A5, ACC5, ACC5{%k7}
vpaddq A6, ACC6, ACC6{%k7}
vpaddq A7, ACC7, ACC7{%k7}
vpaddq A8, ACC8, ACC8{%k7}
vpaddq A9, ACC9, ACC9{%k7}
vpaddq A10, ACC10, ACC10{%k7}
vpaddq A11, ACC11, ACC11{%k7}
vpaddq A12, ACC12, ACC12{%k7}
vpaddq A13, ACC13, ACC13{%k7}
vpaddq A14, ACC14, ACC14{%k7}
mov $0x11, hlp
kmovq hlp, %k7
vpaddq 0*8+.LpolyX(%rip){1to8}, ACC0, ACC0{%k7}
vpaddq 1*8+.LpolyX(%rip){1to8}, ACC1, ACC1{%k7}
vpaddq 2*8+.LpolyX(%rip){1to8}, ACC2, ACC2{%k7}
vpaddq 3*8+.LpolyX(%rip){1to8}, ACC3, ACC3{%k7}
vpaddq 4*8+.LpolyX(%rip){1to8}, ACC4, ACC4{%k7}
vpaddq 5*8+.LpolyX(%rip){1to8}, ACC5, ACC5{%k7}
vpaddq 6*8+.LpolyX(%rip){1to8}, ACC6, ACC6{%k7}
vpaddq 7*8+.LpolyX(%rip){1to8}, ACC7, ACC7{%k7}
vpaddq 8*8+.LpolyX(%rip){1to8}, ACC8, ACC8{%k7}
vpaddq 9*8+.LpolyX(%rip){1to8}, ACC9, ACC9{%k7}
vpaddq 10*8+.LpolyX(%rip){1to8}, ACC10, ACC10{%k7}
vpaddq 11*8+.LpolyX(%rip){1to8}, ACC11, ACC11{%k7}
vpaddq 12*8+.LpolyX(%rip){1to8}, ACC12, ACC12{%k7}
vpaddq 13*8+.LpolyX(%rip){1to8}, ACC13, ACC13{%k7}
vpaddq 14*8+.LpolyX(%rip){1to8}, ACC14, ACC14{%k7}
vpsubq A0, ACC0, ACC0{%k7}
vpsubq A1, ACC1, ACC1{%k7}
vpsubq A2, ACC2, ACC2{%k7}
vpsubq A3, ACC3, ACC3{%k7}
vpsubq A4, ACC4, ACC4{%k7}
vpsubq A5, ACC5, ACC5{%k7}
vpsubq A6, ACC6, ACC6{%k7}
vpsubq A7, ACC7, ACC7{%k7}
vpsubq A8, ACC8, ACC8{%k7}
vpsubq A9, ACC9, ACC9{%k7}
vpsubq A10, ACC10, ACC10{%k7}
vpsubq A11, ACC11, ACC11{%k7}
vpsubq A12, ACC12, ACC12{%k7}
vpsubq A13, ACC13, ACC13{%k7}
vpsubq A14, ACC14, ACC14{%k7}
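// Carry propagation: fold the bits above 52 of each limb into the next limb,
// then mask every limb back to 52 bits.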
vpsrlq $52, ACC0, B
vpaddq B, ACC1, ACC1
vpandq .Lpoly(%rip){1to8}, ACC0, ACC0
vpsrlq $52, ACC1, B
vpaddq B, ACC2, ACC2
vpandq .Lpoly(%rip){1to8}, ACC1, ACC1
vpsrlq $52, ACC2, B
vpaddq B, ACC3, ACC3
vpandq .Lpoly(%rip){1to8}, ACC2, ACC2
vpsrlq $52, ACC3, B
vpaddq B, ACC4, ACC4
vpandq .Lpoly(%rip){1to8}, ACC3, ACC3
vpsrlq $52, ACC4, B
vpaddq B, ACC5, ACC5
vpandq .Lpoly(%rip){1to8}, ACC4, ACC4
vpsrlq $52, ACC5, B
vpaddq B, ACC6, ACC6
vpandq .Lpoly(%rip){1to8}, ACC5, ACC5
vpsrlq $52, ACC6, B
vpaddq B, ACC7, ACC7
vpandq .Lpoly(%rip){1to8}, ACC6, ACC6
vpsrlq $52, ACC7, B
vpaddq B, ACC8, ACC8
vpandq .Lpoly(%rip){1to8}, ACC7, ACC7
vpsrlq $52, ACC8, B
vpaddq B, ACC9, ACC9
vpandq .Lpoly(%rip){1to8}, ACC8, ACC8
vpsrlq $52, ACC9, B
vpaddq B, ACC10, ACC10
vpandq .Lpoly(%rip){1to8}, ACC9, ACC9
vpsrlq $52, ACC10, B
vpaddq B, ACC11, ACC11
vpandq .Lpoly(%rip){1to8}, ACC10, ACC10
vpsrlq $52, ACC11, B
vpaddq B, ACC12, ACC12
vpandq .Lpoly(%rip){1to8}, ACC11, ACC11
vpsrlq $52, ACC12, B
vpaddq B, ACC13, ACC13
vpandq .Lpoly(%rip){1to8}, ACC12, ACC12
vpsrlq $52, ACC13, B
vpaddq B, ACC14, ACC14
vpandq .Lpoly(%rip){1to8}, ACC13, ACC13
vpandq .Lpoly(%rip){1to8}, ACC14, ACC14
// Transpose to horizontal
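// Only the even lanes are live here (lanes 0/2 = r0[0]/r0[1], lanes 4/6 =
// r1[0]/r1[1]); vpunpcklqdq keeps exactly those lanes.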
vpunpcklqdq ACC1, ACC0, ACC0
vpunpcklqdq ACC3, ACC2, ACC1
vpunpcklqdq ACC5, ACC4, ACC2
vpunpcklqdq ACC7, ACC6, ACC3
vpunpcklqdq ACC9, ACC8, ACC4
vpunpcklqdq ACC11, ACC10, ACC5
vpunpcklqdq ACC13, ACC12, ACC6
vmovdqa64 ACC14, ACC7
vshufi64x2 $0x44, ACC1, ACC0, A0
vshufi64x2 $0x44, ACC3, ACC2, A1
vshufi64x2 $0x44, ACC5, ACC4, A2
vshufi64x2 $0x44, ACC7, ACC6, A3
vshufi64x2 $0xee, ACC1, ACC0, A4
vshufi64x2 $0xee, ACC3, ACC2, A5
vshufi64x2 $0xee, ACC5, ACC4, A6
vshufi64x2 $0xee, ACC7, ACC6, A7
vshufi64x2 $0x88, A1, A0, ACC0
vshufi64x2 $0x88, A3, A2, ACC1
vshufi64x2 $0xdd, A1, A0, ACC2
vshufi64x2 $0xdd, A3, A2, ACC3
vshufi64x2 $0x88, A5, A4, ACC4
vshufi64x2 $0x88, A7, A6, ACC5
vshufi64x2 $0xdd, A5, A4, ACC6
vshufi64x2 $0xdd, A7, A6, ACC7
vmovdqu64 ACC0, 0*64(r0ptr)
vmovdqu64 ACC1, 1*64(r0ptr){%k5}
lea 15*8(r0ptr), r0ptr
vmovdqu64 ACC2, 0*64(r0ptr)
vmovdqu64 ACC3, 1*64(r0ptr){%k5}
vmovdqu64 ACC4, 0*64(r1ptr)
vmovdqu64 ACC5, 1*64(r1ptr){%k5}
lea 15*8(r1ptr), r1ptr
vmovdqu64 ACC6, 0*64(r1ptr)
vmovdqu64 ACC7, 1*64(r1ptr){%k5}
mov %rbp, %rsp
pop %rbp
ret
// Performs 8 field multiplications in parallel
.globl C_ABI(amm_751_ifma_x2)
C_ABI(amm_751_ifma_x2):
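// Operands are stored "vertically": the i-th 64-byte vector at aptr/bptr
// holds limb i of all eight field elements. The a limbs are loaded into
// A0..A14 and the code falls through into the shared inner loop do_mul_x2,
// which leaves the result limbs in ACC0..ACC14.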
vmovdqu64 0*64(aptr), A0
vmovdqu64 1*64(aptr), A1
vmovdqu64 2*64(aptr), A2
vmovdqu64 3*64(aptr), A3
vmovdqu64 4*64(aptr), A4
vmovdqu64 5*64(aptr), A5
vmovdqu64 6*64(aptr), A6
vmovdqu64 7*64(aptr), A7
vmovdqu64 8*64(aptr), A8
vmovdqu64 9*64(aptr), A9
vmovdqu64 10*64(aptr), A10
vmovdqu64 11*64(aptr), A11
vmovdqu64 12*64(aptr), A12
vmovdqu64 13*64(aptr), A13
vmovdqu64 14*64(aptr), A14
do_mul_x2:
vpxorq ACC0, ACC0, ACC0
vpxorq ACC1, ACC1, ACC1
vpxorq ACC2, ACC2, ACC2
vpxorq ACC3, ACC3, ACC3
vpxorq ACC4, ACC4, ACC4
vpxorq ACC5, ACC5, ACC5
vpxorq ACC6, ACC6, ACC6
vpxorq ACC7, ACC7, ACC7
vpxorq ACC8, ACC8, ACC8
vpxorq ACC9, ACC9, ACC9
vpxorq ACC10, ACC10, ACC10
vpxorq ACC11, ACC11, ACC11
vpxorq ACC12, ACC12, ACC12
vpxorq ACC13, ACC13, ACC13
vpxorq ACC14, ACC14, ACC14
vpxorq ACC15, ACC15, ACC15
mov $15, hlp
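// Main loop: 15 iterations, one per multiplier limb. Each iteration loads one
// vector of b limbs, accumulates the low and high 52-bit halves of the eight
// lane-wise products into the ACC window, then performs one word-wise
// reduction step against .Lpoly and slides the window down by one limb.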
1:
vmovdqu64 (bptr), B
lea 1*64(bptr), bptr
vpmadd52luq A0, B, ACC0
vpmadd52luq A1, B, ACC1
vpmadd52luq A2, B, ACC2
vpmadd52luq A3, B, ACC3
vpmadd52luq A4, B, ACC4
vpmadd52luq A5, B, ACC5
vpmadd52luq A6, B, ACC6
vpmadd52luq A7, B, ACC7
vpmadd52luq A8, B, ACC8
vpmadd52luq A9, B, ACC9
vpmadd52luq A10, B, ACC10
vpmadd52luq A11, B, ACC11
vpmadd52luq A12, B, ACC12
vpmadd52luq A13, B, ACC13
vpmadd52luq A14, B, ACC14
vpmadd52huq A0, B, ACC1
vpmadd52huq A1, B, ACC2
vpmadd52huq A2, B, ACC3
vpmadd52huq A3, B, ACC4
vpmadd52huq A4, B, ACC5
vpmadd52huq A5, B, ACC6
vpmadd52huq A6, B, ACC7
vpmadd52huq A7, B, ACC8
vpmadd52huq A8, B, ACC9
vpmadd52huq A9, B, ACC10
vpmadd52huq A10, B, ACC11
vpmadd52huq A11, B, ACC12
vpmadd52huq A12, B, ACC13
vpmadd52huq A13, B, ACC14
vpmadd52huq A14, B, ACC15
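// Reduction step: B keeps a copy of ACC0. Adding the low halves of B * p
// makes ACC0 divisible by 2^52 (the lowest limb of p is 2^52 - 1), so only
// its carry survives and is folded into ACC1; the remaining accumulators
// shift down one position and the high halves of B * p are then added at
// their shifted positions.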
vmovdqa64 ACC0, B
vpmadd52luq 0*8 + .Lpoly(%rip){1to8}, B, ACC0
vpsrlq $52, ACC0, ACC0
vpmadd52luq 1*8 + .Lpoly(%rip){1to8}, B, ACC1
vpaddq ACC1, ACC0, ACC0
vpmadd52luq 2*8 + .Lpoly(%rip){1to8}, B, ACC2
vmovdqa64 ACC2, ACC1
vpmadd52luq 3*8 + .Lpoly(%rip){1to8}, B, ACC3
vmovdqa64 ACC3, ACC2
vpmadd52luq 4*8 + .Lpoly(%rip){1to8}, B, ACC4
vmovdqa64 ACC4, ACC3
vpmadd52luq 5*8 + .Lpoly(%rip){1to8}, B, ACC5
vmovdqa64 ACC5, ACC4
vpmadd52luq 6*8 + .Lpoly(%rip){1to8}, B, ACC6
vmovdqa64 ACC6, ACC5
vpmadd52luq 7*8 + .Lpoly(%rip){1to8}, B, ACC7
vmovdqa64 ACC7, ACC6
vpmadd52luq 8*8 + .Lpoly(%rip){1to8}, B, ACC8
vmovdqa64 ACC8, ACC7
vpmadd52luq 9*8 + .Lpoly(%rip){1to8}, B, ACC9
vmovdqa64 ACC9, ACC8
vpmadd52luq 10*8 + .Lpoly(%rip){1to8}, B, ACC10
vmovdqa64 ACC10, ACC9
vpmadd52luq 11*8 + .Lpoly(%rip){1to8}, B, ACC11
vmovdqa64 ACC11, ACC10
vpmadd52luq 12*8 + .Lpoly(%rip){1to8}, B, ACC12
vmovdqa64 ACC12, ACC11
vpmadd52luq 13*8 + .Lpoly(%rip){1to8}, B, ACC13
vmovdqa64 ACC13, ACC12
vpmadd52luq 14*8 + .Lpoly(%rip){1to8}, B, ACC14
vmovdqa64 ACC14, ACC13
vmovdqa64 ACC15, ACC14
vpxorq ACC15, ACC15, ACC15
vpmadd52huq 0*8 + .Lpoly(%rip){1to8}, B, ACC0
vpmadd52huq 1*8 + .Lpoly(%rip){1to8}, B, ACC1
vpmadd52huq 2*8 + .Lpoly(%rip){1to8}, B, ACC2
vpmadd52huq 3*8 + .Lpoly(%rip){1to8}, B, ACC3
vpmadd52huq 4*8 + .Lpoly(%rip){1to8}, B, ACC4
vpmadd52huq 5*8 + .Lpoly(%rip){1to8}, B, ACC5
vpmadd52huq 6*8 + .Lpoly(%rip){1to8}, B, ACC6
vpmadd52huq 7*8 + .Lpoly(%rip){1to8}, B, ACC7
vpmadd52huq 8*8 + .Lpoly(%rip){1to8}, B, ACC8
vpmadd52huq 9*8 + .Lpoly(%rip){1to8}, B, ACC9
vpmadd52huq 10*8 + .Lpoly(%rip){1to8}, B, ACC10
vpmadd52huq 11*8 + .Lpoly(%rip){1to8}, B, ACC11
vpmadd52huq 12*8 + .Lpoly(%rip){1to8}, B, ACC12
vpmadd52huq 13*8 + .Lpoly(%rip){1to8}, B, ACC13
vpmadd52huq 14*8 + .Lpoly(%rip){1to8}, B, ACC14
dec hlp
jnz 1b
ret