#if defined(__APPLE__)
/* OS X's C ABI prefixes functions with underscore. */
#define C_ABI(x) _ ## x
#define HIDDEN .private_extern
#else
#define C_ABI(x) x
#define HIDDEN .hidden
#endif
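# Field elements are stored in radix 2^52: 15 digits of 52 bits each, split
# across two zmm registers (8 + 7 qwords); an Fp2 element is two such field
# elements placed 15*8 bytes apart. .LandMask is the 52-bit digit mask,
# .Lpoly is the modulus in this digit form, and .LpolyX appears to be the
# modulus with every digit shifted left by 8 bits (i.e. 2^8 * p), added before
# subtractions so that no digit goes negative. .LpermMask0/.LshiftMask0 are
# not referenced by the routines shown here.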
.p2align 6
.LpermMask0:
.word 0,1,2,3, 3,4,5,6, 6,7,8,9, 9,10,11,12, 13,14,15,16, 16,17,18,19, 19,20,21,22, 22,23,24,25
.LshiftMask0:
.quad 0,4,8,12,0,4,8,12
.LandMask:
.quad 0xfffffffffffff
.p2align 6
.Lpoly:
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff
.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480
.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0
.LpolyX:
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x049f878a8eeaff00
.quad 0x07cc76e3ec968500, 0x076da959b1a13f00, 0x084e9867d6ebe800, 0x0b5045cb25748000
.quad 0x0f97badc66856200, 0x041f71c0e1290900, 0x000000006fe5d500, 0
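# Register roles (as used by the code below): A0*/A1* hold the two Fp
# coefficients of the first operand, each split into an 'a' (low 8 digits)
# and 'b' (high 7 digits) half; ACC0..ACC3 accumulate the partial products
# A0*B0, A1*B1, A1*B0 and A0*B1; B*curr/B*prev broadcast the current/previous
# digit of the second operand; Y* are the per-digit reduction multipliers;
# T0..T3 hold the carry out of the digit retired in each iteration.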
#define felemR %rdi
#define felemA %rsi
#define felemB %rdx
#define itr %r10
#define M0 %zmm0
#define M1 %zmm1
#define ZERO %zmm2
#define AND_MASK %zmm3
#define A0a %zmm4
#define A0b %zmm5
#define A1a %zmm6
#define A1b %zmm7
#define ACC0a %zmm8
#define ACC0b %zmm9
#define ACC1a %zmm10
#define ACC1b %zmm11
#define ACC2a %zmm12
#define ACC2b %zmm13
#define ACC3a %zmm14
#define ACC3b %zmm15
#define B0curr %zmm16
#define B0prev %zmm17
#define B1curr %zmm18
#define B1prev %zmm19
#define Y0curr %zmm20
#define Y0prev %zmm21
#define Y1curr %zmm22
#define Y1prev %zmm23
#define Y2curr %zmm24
#define Y2prev %zmm25
#define Y3curr %zmm26
#define Y3prev %zmm27
#define T0 %zmm28
#define T1 %zmm29
#define T2 %zmm30
#define T3 %zmm31
###############################################################################
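# fp2_mul_ifma(felemR, felemA, felemB): Fp2 multiplication using the AVX-512
# IFMA instructions (vpmadd52luq/vpmadd52huq), with a Montgomery-style
# reduction interleaved digit by digit. Because the low digit of .Lpoly is
# 2^52-1, the per-step reduction multiplier Y is simply the accumulator's low
# digit, broadcast with vpermq. The four products A0*B0, A1*B1, A1*B0 and
# A0*B1 are accumulated in parallel and combined at the end into
# C0 = A0*B0 - A1*B1 and C1 = A0*B1 + A1*B0.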
.globl C_ABI(fp2_mul_ifma)
.p2align 6
C_ABI(fp2_mul_ifma):
mov $1, %eax
kmovw %eax, %k1
mov $0x7f, %eax
kmovw %eax, %k5
vpbroadcastq .LandMask(%rip), AND_MASK
vpxorq ZERO, ZERO, ZERO
vmovdqu64 64*0(felemA), A0a
vmovdqu64 64*1(felemA), A0b{%k5}{z}
vmovdqu64 15*8 + 64*0(felemA), A1a
vmovdqu64 15*8 + 64*1(felemA), A1b{%k5}{z}
# Load the modulus
vmovdqa64 64*0 + .Lpoly(%rip), M0
vmovdqa64 64*1 + .Lpoly(%rip), M1
# Prepare the accumulators
vpxorq ACC0a, ACC0a, ACC0a
vpxorq ACC0b, ACC0b, ACC0b
vpxorq ACC1a, ACC1a, ACC1a
vpxorq ACC1b, ACC1b, ACC1b
vpxorq ACC2a, ACC2a, ACC2a
vpxorq ACC2b, ACC2b, ACC2b
vpxorq ACC3a, ACC3a, ACC3a
vpxorq ACC3b, ACC3b, ACC3b
vpxorq T0, T0, T0
vpxorq T1, T1, T1
vpxorq T2, T2, T2
vpxorq T3, T3, T3
# First iteration
vpbroadcastq (felemB), B0curr
vpbroadcastq 15*8(felemB), B1curr
lea 8(felemB), felemB
vpmadd52luq B0curr, A0a, ACC0a
vpmadd52luq B0curr, A0b, ACC0b
vpmadd52luq B1curr, A1a, ACC1a
vpmadd52luq B1curr, A1b, ACC1b
vpmadd52luq B0curr, A1a, ACC2a
vpmadd52luq B0curr, A1b, ACC2b
vpmadd52luq B1curr, A0a, ACC3a
vpmadd52luq B1curr, A0b, ACC3b
vpermq ACC0a, ZERO, Y0curr
vpermq ACC1a, ZERO, Y1curr
vpermq ACC2a, ZERO, Y2curr
vpermq ACC3a, ZERO, Y3curr
vpmadd52luq Y0curr, M0, ACC0a
vpmadd52luq Y0curr, M1, ACC0b
vpmadd52luq Y1curr, M0, ACC1a
vpmadd52luq Y1curr, M1, ACC1b
vpmadd52luq Y2curr, M0, ACC2a
vpmadd52luq Y2curr, M1, ACC2b
vpmadd52luq Y3curr, M0, ACC3a
vpmadd52luq Y3curr, M1, ACC3b
vpsrlq $52, ACC0a, T0{%k1}{z}
vpsrlq $52, ACC1a, T1{%k1}{z}
vpsrlq $52, ACC2a, T2{%k1}{z}
vpsrlq $52, ACC3a, T3{%k1}{z}
mov $14, itr
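# Main loop: 14 more digits of the second operand. Each iteration retires one
# digit: the accumulators are shifted down by one qword, the previous digit's
# high-half products (vpmadd52huq) and the current digit's low-half products
# (vpmadd52luq) are accumulated, and a new reduction multiplier Y is taken
# from the low digit of each accumulator.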
1:
# Shift the ACC in zmms right by a word
valignq $1, ACC0a, ACC0b, ACC0a
valignq $1, ACC0b, ZERO, ACC0b
valignq $1, ACC1a, ACC1b, ACC1a
valignq $1, ACC1b, ZERO, ACC1b
valignq $1, ACC2a, ACC2b, ACC2a
valignq $1, ACC2b, ZERO, ACC2b
valignq $1, ACC3a, ACC3b, ACC3a
valignq $1, ACC3b, ZERO, ACC3b
vmovdqa64 B0curr, B0prev
vmovdqa64 B1curr, B1prev
vmovdqa64 Y0curr, Y0prev
vmovdqa64 Y1curr, Y1prev
vmovdqa64 Y2curr, Y2prev
vmovdqa64 Y3curr, Y3prev
vpbroadcastq (felemB), B0curr
vpbroadcastq 15*8(felemB), B1curr
lea 8(felemB), felemB
# High multiplications
vpmadd52huq B0prev, A0a, ACC0a # ACC0 = A0 * B0
vpmadd52huq B0prev, A0b, ACC0b
vpmadd52huq B1prev, A1a, ACC1a # ACC1 = A1 * B1
vpmadd52huq B1prev, A1b, ACC1b
vpmadd52huq B0prev, A1a, ACC2a # ACC2 = A1 * B0
vpmadd52huq B0prev, A1b, ACC2b
vpmadd52huq B1prev, A0a, ACC3a # ACC3 = A0 * B1
vpmadd52huq B1prev, A0b, ACC3b
vpmadd52huq Y0prev, M0, ACC0a
vpmadd52huq Y0prev, M1, ACC0b
vpmadd52huq Y1prev, M0, ACC1a
vpmadd52huq Y1prev, M1, ACC1b
vpmadd52huq Y2prev, M0, ACC2a
vpmadd52huq Y2prev, M1, ACC2b
vpmadd52huq Y3prev, M0, ACC3a
vpmadd52huq Y3prev, M1, ACC3b
# Low multiplications
vpmadd52luq B0curr, A0a, ACC0a
vpmadd52luq B0curr, A0b, ACC0b
vpmadd52luq B1curr, A1a, ACC1a
vpmadd52luq B1curr, A1b, ACC1b
vpmadd52luq B0curr, A1a, ACC2a
vpmadd52luq B0curr, A1b, ACC2b
vpmadd52luq B1curr, A0a, ACC3a
vpmadd52luq B1curr, A0b, ACC3b
vpaddq T0, ACC0a, ACC0a
vpaddq T1, ACC1a, ACC1a
vpaddq T2, ACC2a, ACC2a
vpaddq T3, ACC3a, ACC3a
vpermq ACC0a, ZERO, Y0curr
vpermq ACC1a, ZERO, Y1curr
vpermq ACC2a, ZERO, Y2curr
vpermq ACC3a, ZERO, Y3curr
vpmadd52luq Y0curr, M0, ACC0a
vpmadd52luq Y0curr, M1, ACC0b
vpmadd52luq Y1curr, M0, ACC1a
vpmadd52luq Y1curr, M1, ACC1b
vpmadd52luq Y2curr, M0, ACC2a
vpmadd52luq Y2curr, M1, ACC2b
vpmadd52luq Y3curr, M0, ACC3a
vpmadd52luq Y3curr, M1, ACC3b
vpsrlq $52, ACC0a, T0{%k1}{z}
vpsrlq $52, ACC1a, T1{%k1}{z}
vpsrlq $52, ACC2a, T2{%k1}{z}
vpsrlq $52, ACC3a, T3{%k1}{z}
dec itr
jne 1b
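# Loop tail: shift once more, add the pending carries, then add the remaining
# high-half products for the last digit and its reduction step.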
valignq $1, ACC0a, ACC0b, ACC0a
valignq $1, ACC0b, ZERO, ACC0b
valignq $1, ACC1a, ACC1b, ACC1a
valignq $1, ACC1b, ZERO, ACC1b
valignq $1, ACC2a, ACC2b, ACC2a
valignq $1, ACC2b, ZERO, ACC2b
valignq $1, ACC3a, ACC3b, ACC3a
valignq $1, ACC3b, ZERO, ACC3b
vpaddq T0, ACC0a, ACC0a
vpaddq T1, ACC1a, ACC1a
vpaddq T2, ACC2a, ACC2a
vpaddq T3, ACC3a, ACC3a
# The last high multiplications
vpmadd52huq B0curr, A0a, ACC0a
vpmadd52huq B0curr, A0b, ACC0b
vpmadd52huq B1curr, A1a, ACC1a
vpmadd52huq B1curr, A1b, ACC1b
vpmadd52huq B0curr, A1a, ACC2a
vpmadd52huq B0curr, A1b, ACC2b
vpmadd52huq B1curr, A0a, ACC3a
vpmadd52huq B1curr, A0b, ACC3b
vpmadd52huq Y0curr, M0, ACC0a
vpmadd52huq Y0curr, M1, ACC0b
vpmadd52huq Y1curr, M0, ACC1a
vpmadd52huq Y1curr, M1, ACC1b
vpmadd52huq Y2curr, M0, ACC2a
vpmadd52huq Y2curr, M1, ACC2b
vpmadd52huq Y3curr, M0, ACC3a
vpmadd52huq Y3curr, M1, ACC3b
# C0 = A0*B0 - A1*B1
# C1 = A0*B1 + A1*B0
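# ACC0 = A0*B0, ACC1 = A1*B1, ACC2 = A1*B0, ACC3 = A0*B1. A multiple of the
# modulus (.LpolyX) is added to ACC0 before subtracting ACC1 so that every
# digit of C0 stays non-negative; the digits are brought back to 52 bits by
# the normalization below.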
vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b
vpaddq ACC3a, ACC2a, ACC2a
vpaddq ACC3b, ACC2b, ACC2b
vpsubq ACC1a, ACC0a, ACC0a
vpsubq ACC1b, ACC0b, ACC0b
# Now 'normalize' the acc to 52 bit words
vpsrlq $52, ACC0a, A0a
vpsrlq $52, ACC0b, A0b
vpsrlq $52, ACC2a, A1a
vpsrlq $52, ACC2b, A1b
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b
valignq $7, A0a, A0b, A0b
valignq $7, ZERO, A0a, A0a
valignq $7, A1a, A1b, A1b
valignq $7, ZERO, A1a, A1a
vpaddq A0a, ACC0a, ACC0a
vpaddq A0b, ACC0b, ACC0b
vpaddq A1a, ACC2a, ACC2a
vpaddq A1b, ACC2b, ACC2b
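# Carry propagation across the 52-bit digits using mask registers: the vpcmpuq
# pairs classify each lane (carry-out and digit == 2^52-1, the latter being
# the lanes that forward an incoming carry); the add/adc/xor sequence on the
# byte-sized masks ripples the carry chain across all digits at once (adc
# moves the carry from the low zmm into the high zmm); the selected lanes then
# get their +1 via the masked subtract of 2^52-1 and the final 52-bit AND.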
vpcmpuq $1, A0a, ACC0a, %k1
vpcmpuq $1, A0b, ACC0b, %k2
vpcmpuq $0, AND_MASK, ACC0a, %k3
vpcmpuq $0, AND_MASK, ACC0b, %k4
kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d
add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2
vpsubq AND_MASK, ACC0a, ACC0a{%k1}
vpsubq AND_MASK, ACC0b, ACC0b{%k2}
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b
vpcmpuq $1, A1a, ACC2a, %k1
vpcmpuq $1, A1b, ACC2b, %k2
vpcmpuq $0, AND_MASK, ACC2a, %k3
vpcmpuq $0, AND_MASK, ACC2b, %k4
kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d
add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2
vpsubq AND_MASK, ACC2a, ACC2a{%k1}
vpsubq AND_MASK, ACC2b, ACC2b{%k2}
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b
mov $0x7f, %eax
kmovw %eax, %k1
vmovdqu64 ACC0a, 64*0(felemR)
vmovdqu64 ACC0b, 64*1(felemR){%k5}
vmovdqu64 ACC2a, 15*8 + 64*0(felemR)
vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k5}
ret
###############################################################################
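# fp2_sqr_ifma(felemR, felemA): Fp2 squaring. Only three products are needed:
# ACC0 = A0*A0, ACC1 = A1*A1 and ACC2 = A0*A1, giving C0 = A0^2 - A1^2 and
# C1 = 2*A0*A1 (ACC2 is doubled at the end). ST0-ST2 alias registers that are
# free here and serve as extra accumulators so the 'b'-half products form
# independent dependency chains.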
#define ST0 ACC3a
#define ST1 ACC3b
#define ST2 Y3curr
.globl C_ABI(fp2_sqr_ifma)
.p2align 6
C_ABI(fp2_sqr_ifma):
mov $1, %eax
kmovw %eax, %k1
mov $0x7f, %eax
kmovw %eax, %k2
vpbroadcastq .LandMask(%rip), AND_MASK
vpxorq ZERO, ZERO, ZERO
vmovdqu64 64*0(felemA), A0a
vmovdqu64 64*1(felemA), A0b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemA), A1a
vmovdqu64 15*8 + 64*1(felemA), A1b{%k2}{z}
# Load the modulus
vmovdqa64 64*0 + .Lpoly(%rip), M0
vmovdqa64 64*1 + .Lpoly(%rip), M1
# Prepare the accumulators
vpxorq ACC0a, ACC0a, ACC0a
vpxorq ACC0b, ACC0b, ACC0b
vpxorq ACC1a, ACC1a, ACC1a
vpxorq ACC1b, ACC1b, ACC1b
vpxorq ACC2a, ACC2a, ACC2a
vpxorq ACC2b, ACC2b, ACC2b
vpxorq T0, T0, T0
vpxorq T1, T1, T1
vpxorq T2, T2, T2
# First iteration
vpbroadcastq (felemA), B0curr
vpbroadcastq 15*8(felemA), B1curr
lea 8(felemA), felemA
vpmadd52luq B0curr, A0a, ACC0a
vpmadd52luq B0curr, A0b, ACC0b
vpmadd52luq B1curr, A1a, ACC1a
vpmadd52luq B1curr, A1b, ACC1b
vpmadd52luq B0curr, A1a, ACC2a
vpmadd52luq B0curr, A1b, ACC2b
vpermq ACC0a, ZERO, Y0curr
vpermq ACC1a, ZERO, Y1curr
vpermq ACC2a, ZERO, Y2curr
vpmadd52luq Y0curr, M0, ACC0a
vpmadd52luq Y0curr, M1, ACC0b
vpmadd52luq Y1curr, M0, ACC1a
vpmadd52luq Y1curr, M1, ACC1b
vpmadd52luq Y2curr, M0, ACC2a
vpmadd52luq Y2curr, M1, ACC2b
vpsrlq $52, ACC0a, T0{%k1}{z}
vpsrlq $52, ACC1a, T1{%k1}{z}
vpsrlq $52, ACC2a, T2{%k1}{z}
mov $14, itr
1:
# Shift the ACC in zmms right by a word
valignq $1, ACC0a, ACC0b, ACC0a
valignq $1, ACC0b, ZERO, ACC0b
valignq $1, ACC1a, ACC1b, ACC1a
valignq $1, ACC1b, ZERO, ACC1b
valignq $1, ACC2a, ACC2b, ACC2a
valignq $1, ACC2b, ZERO, ACC2b
vpxorq ST0, ST0, ST0
vpxorq ST1, ST1, ST1
vpxorq ST2, ST2, ST2
vmovdqa64 B0curr, B0prev
vmovdqa64 B1curr, B1prev
vmovdqa64 Y0curr, Y0prev
vmovdqa64 Y1curr, Y1prev
vmovdqa64 Y2curr, Y2prev
vpbroadcastq (felemA), B0curr
vpbroadcastq 15*8(felemA), B1curr
lea 8(felemA), felemA
# High multiplications
vpmadd52huq B0prev, A0a, ACC0a # ACC0 = A0 * B0
vpmadd52huq B1prev, A1a, ACC1a # ACC1 = A1 * B1
vpmadd52huq B0prev, A1a, ACC2a # ACC2 = A1 * B0
vpmadd52huq B0prev, A0b, ACC0b
vpmadd52huq B1prev, A1b, ACC1b
vpmadd52huq B0prev, A1b, ACC2b
# We really want to have 8 independent vpmadd instructions in the pipe
vpmadd52huq Y0prev, M0, T0
vpmadd52huq Y1prev, M0, T1
vpmadd52huq Y2prev, M0, T2
vpmadd52huq Y0prev, M1, ACC0b
vpmadd52huq Y1prev, M1, ACC1b
vpmadd52huq Y2prev, M1, ACC2b
# Low multiplications
vpmadd52luq B0curr, A0a, ACC0a
vpmadd52luq B1curr, A1a, ACC1a
vpmadd52luq B0curr, A1a, ACC2a
vpmadd52luq B0curr, A0b, ST0
vpmadd52luq B1curr, A1b, ST1
vpmadd52luq B0curr, A1b, ST2
vpaddq T0, ACC0a, ACC0a
vpaddq T1, ACC1a, ACC1a
vpaddq T2, ACC2a, ACC2a
vpermq ACC0a, ZERO, Y0curr
vpermq ACC1a, ZERO, Y1curr
vpermq ACC2a, ZERO, Y2curr
vpaddq ST0, ACC0b, ACC0b
vpaddq ST1, ACC1b, ACC1b
vpaddq ST2, ACC2b, ACC2b
vpmadd52luq Y0curr, M0, ACC0a
vpmadd52luq Y0curr, M1, ACC0b
vpmadd52luq Y1curr, M0, ACC1a
vpmadd52luq Y1curr, M1, ACC1b
vpmadd52luq Y2curr, M0, ACC2a
vpmadd52luq Y2curr, M1, ACC2b
vpsrlq $52, ACC0a, T0{%k1}{z}
vpsrlq $52, ACC1a, T1{%k1}{z}
vpsrlq $52, ACC2a, T2{%k1}{z}
dec itr
jne 1b
valignq $1, ACC0a, ACC0b, ACC0a
valignq $1, ACC0b, ZERO, ACC0b
valignq $1, ACC1a, ACC1b, ACC1a
valignq $1, ACC1b, ZERO, ACC1b
valignq $1, ACC2a, ACC2b, ACC2a
valignq $1, ACC2b, ZERO, ACC2b
vpaddq T0, ACC0a, ACC0a
vpaddq T1, ACC1a, ACC1a
vpaddq T2, ACC2a, ACC2a
# The last high multiplications
vpmadd52huq B0curr, A0a, ACC0a
vpmadd52huq B0curr, A0b, ACC0b
vpmadd52huq B1curr, A1a, ACC1a
vpmadd52huq B1curr, A1b, ACC1b
vpmadd52huq B0curr, A1a, ACC2a
vpmadd52huq B0curr, A1b, ACC2b
vpmadd52huq Y0curr, M0, ACC0a
vpmadd52huq Y0curr, M1, ACC0b
vpmadd52huq Y1curr, M0, ACC1a
vpmadd52huq Y1curr, M1, ACC1b
vpmadd52huq Y2curr, M0, ACC2a
vpmadd52huq Y2curr, M1, ACC2b
# For squaring (B == A): C0 = A0^2 - A1^2
# C1 = 2*A0*A1 (ACC2 holds A0*A1 and is doubled below)
vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b
vpaddq ACC2a, ACC2a, ACC2a
vpaddq ACC2b, ACC2b, ACC2b
vpsubq ACC1a, ACC0a, ACC0a
vpsubq ACC1b, ACC0b, ACC0b
# Now 'normalize' the acc to 52 bit words
vpsrlq $52, ACC0a, A0a
vpsrlq $52, ACC0b, A0b
vpsrlq $52, ACC2a, A1a
vpsrlq $52, ACC2b, A1b
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b
valignq $7, A0a, A0b, A0b
valignq $7, ZERO, A0a, A0a
valignq $7, A1a, A1b, A1b
valignq $7, ZERO, A1a, A1a
vpaddq A0a, ACC0a, ACC0a
vpaddq A0b, ACC0b, ACC0b
vpaddq A1a, ACC2a, ACC2a
vpaddq A1b, ACC2b, ACC2b
vpcmpuq $1, A0a, ACC0a, %k1
vpcmpuq $1, A0b, ACC0b, %k2
vpcmpuq $0, AND_MASK, ACC0a, %k3
vpcmpuq $0, AND_MASK, ACC0b, %k4
kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d
add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2
vpsubq AND_MASK, ACC0a, ACC0a{%k1}
vpsubq AND_MASK, ACC0b, ACC0b{%k2}
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b
vpcmpuq $1, A1a, ACC2a, %k1
vpcmpuq $1, A1b, ACC2b, %k2
vpcmpuq $0, AND_MASK, ACC2a, %k3
vpcmpuq $0, AND_MASK, ACC2b, %k4
kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d
add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2
vpsubq AND_MASK, ACC2a, ACC2a{%k1}
vpsubq AND_MASK, ACC2b, ACC2b{%k2}
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b
mov $0x7f, %eax
kmovw %eax, %k1
vmovdqu64 ACC0a, 64*0(felemR)
vmovdqu64 ACC0b, 64*1(felemR){%k1}
vmovdqu64 ACC2a, 15*8 + 64*0(felemR)
vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k1}
ret
###############################################################################
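# fp2_sub(felemR, felemA, felemB): component-wise C = A - B in Fp2. A multiple
# of the modulus (.LpolyX) is added first so that no digit goes negative, then
# the result is renormalized by fp2_normalize.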
.globl C_ABI(fp2_sub)
.p2align 6
C_ABI(fp2_sub):
mov $1, %eax
kmovw %eax, %k1
mov $0x7f, %eax
kmovw %eax, %k2
vmovdqu64 64*0(felemA), ACC0a
vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemA), ACC1a
vmovdqu64 15*8 + 64*1(felemA), ACC1b{%k2}{z}
vmovdqu64 64*0(felemB), ACC2a
vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemB), ACC3a
vmovdqu64 15*8 + 64*1(felemB), ACC3b{%k2}{z}
vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b
vpaddq 64*0 + .LpolyX(%rip), ACC1a, ACC1a
vpaddq 64*1 + .LpolyX(%rip), ACC1b, ACC1b
vpsubq ACC2a, ACC0a, ACC0a
vpsubq ACC2b, ACC0b, ACC0b
vpsubq ACC3a, ACC1a, ACC2a
vpsubq ACC3b, ACC1b, ACC2b
jmp C_ABI(fp2_normalize)
###############################################################################
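# fp2_add(felemR, felemA, felemB): component-wise C = A + B in Fp2, falling
# through into fp2_normalize for the digit renormalization.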
.globl C_ABI(fp2_add)
.p2align 6
C_ABI(fp2_add):
mov $1, %eax
kmovw %eax, %k1
mov $0x7f, %eax
kmovw %eax, %k2
vmovdqu64 64*0(felemA), ACC0a
vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemA), ACC1a
vmovdqu64 15*8 + 64*1(felemA), ACC1b{%k2}{z}
vmovdqu64 64*0(felemB), ACC2a
vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemB), ACC3a
vmovdqu64 15*8 + 64*1(felemB), ACC3b{%k2}{z}
vpaddq ACC2a, ACC0a, ACC0a
vpaddq ACC2b, ACC0b, ACC0b
vpaddq ACC3a, ACC1a, ACC2a
vpaddq ACC3b, ACC1b, ACC2b
// Fallthrough
###############################################################################
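# fp2_normalize: bring both coefficients back to 52-bit digits, using the same
# shift/mask/carry-propagation pattern as the tail of fp2_mul_ifma, and store
# the result to felemR.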
.p2align 6
C_ABI(fp2_normalize):
vpbroadcastq .LandMask(%rip), AND_MASK
vpxorq ZERO, ZERO, ZERO
# Now 'normalize' the acc to 52 bit words
vpsrlq $52, ACC0a, A0a
vpsrlq $52, ACC0b, A0b
vpsrlq $52, ACC2a, A1a
vpsrlq $52, ACC2b, A1b
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b
valignq $7, A0a, A0b, A0b
valignq $7, ZERO, A0a, A0a
valignq $7, A1a, A1b, A1b
valignq $7, ZERO, A1a, A1a
vpaddq A0a, ACC0a, ACC0a
vpaddq A0b, ACC0b, ACC0b
vpaddq A1a, ACC2a, ACC2a
vpaddq A1b, ACC2b, ACC2b
vpcmpuq $1, A0a, ACC0a, %k1
vpcmpuq $1, A0b, ACC0b, %k2
vpcmpuq $0, AND_MASK, ACC0a, %k3
vpcmpuq $0, AND_MASK, ACC0b, %k4
kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d
add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2
vpsubq AND_MASK, ACC0a, ACC0a{%k1}
vpsubq AND_MASK, ACC0b, ACC0b{%k2}
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b
vpcmpuq $1, A1a, ACC2a, %k1
vpcmpuq $1, A1b, ACC2b, %k2
vpcmpuq $0, AND_MASK, ACC2a, %k3
vpcmpuq $0, AND_MASK, ACC2b, %k4
kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d
add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2
vpsubq AND_MASK, ACC2a, ACC2a{%k1}
vpsubq AND_MASK, ACC2b, ACC2b{%k2}
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b
mov $0x7f, %eax
kmovw %eax, %k1
vmovdqu64 ACC0a, 64*0(felemR)
vmovdqu64 ACC0b, 64*1(felemR){%k1}
vmovdqu64 ACC2a, 15*8 + 64*0(felemR)
vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k1}
ret
###############################################################################
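# fp2_swap(p1ptr, p2ptr, swap): constant-time conditional swap of two blocks
# of four field elements each. mask = 0 - swap is all-ones when swap is 1 and
# zero otherwise; the vpternlogq immediates 0xd8/0xe4 implement bitwise
# selects between the corresponding loads, so no data-dependent branch or
# memory access is performed.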
#define p1ptr %rdi
#define p2ptr %rsi
#define swap %rdx
.globl C_ABI(fp2_swap)
.p2align 6
C_ABI(fp2_swap):
mov $0x7f, %eax
kmovw %eax, %k2
// TODO: get rid of the masks, not needed
vmovdqu64 64*0(p1ptr), %zmm0
vmovdqu64 64*1(p1ptr), %zmm1{%k2}{z}
vmovdqu64 15*8 + 64*0(p1ptr), %zmm2
vmovdqu64 15*8 + 64*1(p1ptr), %zmm3{%k2}{z}
vmovdqu64 2*15*8 + 64*0(p1ptr), %zmm4
vmovdqu64 2*15*8 + 64*1(p1ptr), %zmm5{%k2}{z}
vmovdqu64 3*15*8 + 64*0(p1ptr), %zmm6
vmovdqu64 3*15*8 + 64*1(p1ptr), %zmm7{%k2}{z}
vmovdqu64 64*0(p2ptr), %zmm8
vmovdqu64 64*1(p2ptr), %zmm9{%k2}{z}
vmovdqu64 15*8 + 64*0(p2ptr), %zmm10
vmovdqu64 15*8 + 64*1(p2ptr), %zmm11{%k2}{z}
vmovdqu64 2*15*8 + 64*0(p2ptr), %zmm12
vmovdqu64 2*15*8 + 64*1(p2ptr), %zmm13{%k2}{z}
vmovdqu64 3*15*8 + 64*0(p2ptr), %zmm14
vmovdqu64 3*15*8 + 64*1(p2ptr), %zmm15{%k2}{z}
vpxorq %zmm16, %zmm16, %zmm16
vpbroadcastq swap, %zmm17
vpsubq %zmm17, %zmm16, %zmm16
vmovdqa64 %zmm8, %zmm17
vmovdqa64 %zmm9, %zmm18
vmovdqa64 %zmm10, %zmm19
vmovdqa64 %zmm11, %zmm20
vmovdqa64 %zmm12, %zmm21
vmovdqa64 %zmm13, %zmm22
vmovdqa64 %zmm14, %zmm23
vmovdqa64 %zmm15, %zmm24
vpternlogq $0xd8, %zmm16, %zmm0, %zmm17
vpternlogq $0xd8, %zmm16, %zmm1, %zmm18
vpternlogq $0xd8, %zmm16, %zmm2, %zmm19
vpternlogq $0xd8, %zmm16, %zmm3, %zmm20
vpternlogq $0xd8, %zmm16, %zmm4, %zmm21
vpternlogq $0xd8, %zmm16, %zmm5, %zmm22
vpternlogq $0xd8, %zmm16, %zmm6, %zmm23
vpternlogq $0xd8, %zmm16, %zmm7, %zmm24
vpternlogq $0xe4, %zmm16, %zmm0, %zmm8
vpternlogq $0xe4, %zmm16, %zmm1, %zmm9
vpternlogq $0xe4, %zmm16, %zmm2, %zmm10
vpternlogq $0xe4, %zmm16, %zmm3, %zmm11
vpternlogq $0xe4, %zmm16, %zmm4, %zmm12
vpternlogq $0xe4, %zmm16, %zmm5, %zmm13
vpternlogq $0xe4, %zmm16, %zmm6, %zmm14
vpternlogq $0xe4, %zmm16, %zmm7, %zmm15
vmovdqu64 %zmm8, 64*0(p1ptr)
vmovdqu64 %zmm9, 64*1(p1ptr){%k2}
vmovdqu64 %zmm10, 15*8 + 64*0(p1ptr)
vmovdqu64 %zmm11, 15*8 + 64*1(p1ptr){%k2}
vmovdqu64 %zmm12, 2*15*8 + 64*0(p1ptr)
vmovdqu64 %zmm13, 2*15*8 + 64*1(p1ptr){%k2}
vmovdqu64 %zmm14, 3*15*8 + 64*0(p1ptr)
vmovdqu64 %zmm15, 3*15*8 + 64*1(p1ptr){%k2}
vmovdqu64 %zmm17, 64*0(p2ptr)
vmovdqu64 %zmm18, 64*1(p2ptr){%k2}
vmovdqu64 %zmm19, 15*8 + 64*0(p2ptr)
vmovdqu64 %zmm20, 15*8 + 64*1(p2ptr){%k2}
vmovdqu64 %zmm21, 2*15*8 + 64*0(p2ptr)
vmovdqu64 %zmm22, 2*15*8 + 64*1(p2ptr){%k2}
vmovdqu64 %zmm23, 3*15*8 + 64*0(p2ptr)
vmovdqu64 %zmm24, 3*15*8 + 64*1(p2ptr){%k2}
ret
###############################################################################
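# fp_add(felemR, felemA, felemB): single-coefficient Fp addition, falling
# through into fp_normalize.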
.globl C_ABI(fp_add)
.p2align 6
C_ABI(fp_add):
mov $0x7f, %eax
kmovw %eax, %k2
vmovdqu64 64*0(felemA), ACC0a
vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
vmovdqu64 64*0(felemB), ACC2a
vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
vpaddq ACC2a, ACC0a, ACC0a
vpaddq ACC2b, ACC0b, ACC0b
// Fallthrough
###############################################################################
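# fp_normalize: 52-bit digit renormalization for a single field element
# (same pattern as fp2_normalize, one coefficient only).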
.p2align 6
C_ABI(fp_normalize):
vpbroadcastq .LandMask(%rip), AND_MASK
vpxorq ZERO, ZERO, ZERO
# Now 'normalize' the acc to 52 bit words
vpsrlq $52, ACC0a, A0a
vpsrlq $52, ACC0b, A0b
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b
valignq $7, A0a, A0b, A0b
valignq $7, ZERO, A0a, A0a
vpaddq A0a, ACC0a, ACC0a
vpaddq A0b, ACC0b, ACC0b
vpcmpuq $1, A0a, ACC0a, %k1
vpcmpuq $1, A0b, ACC0b, %k2
vpcmpuq $0, AND_MASK, ACC0a, %k3
vpcmpuq $0, AND_MASK, ACC0b, %k4
kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d
add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2
vpsubq AND_MASK, ACC0a, ACC0a{%k1}
vpsubq AND_MASK, ACC0b, ACC0b{%k2}
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b
mov $0x7f, %eax
kmovw %eax, %k1
vmovdqu64 ACC0a, 64*0(%rdi)
vmovdqu64 ACC0b, 64*1(%rdi){%k1}
ret
###############################################################################
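# fp_sub(felemR, felemA, felemB): single-coefficient Fp subtraction; .LpolyX
# keeps the digits non-negative before the final normalization.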
.globl C_ABI(fp_sub)
.p2align 6
C_ABI(fp_sub):
mov $0x7f, %eax
kmovw %eax, %k2
vmovdqu64 64*0(felemA), ACC0a
vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
vmovdqu64 64*0(felemB), ACC2a
vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b
vpsubq ACC2a, ACC0a, ACC0a
vpsubq ACC2b, ACC0b, ACC0b
jmp C_ABI(fp_normalize)