You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

2621 lines
45 KiB

  1. // +build amd64,!noasm
  2. #include "textflag.h"
  // All constants below are 64-bit limbs written as immediate operands (note
  // the leading '$'); the numeric suffix is the limb index, limb 0 being the
  // least significant.
  //
  3. // p751 + 1
  // Only limbs 5..11 are defined. Limbs 0..4 of p751 are all ones (see P751_0
  // below), so adding 1 clears them and carries into limb 5
  // (P751P1_5 = P751_5 + 1); limbs 6..11 are the same as those of p751.
  4. #define P751P1_5 $0xEEB0000000000000
  5. #define P751P1_6 $0xE3EC968549F878A8
  6. #define P751P1_7 $0xDA959B1A13F7CC76
  7. #define P751P1_8 $0x084E9867D6EBE876
  8. #define P751P1_9 $0x8562B5045CB25748
  9. #define P751P1_10 $0x0E12909F97BADC66
  10. #define P751P1_11 $0x00006FE5D541F71C
  // p751 itself. P751_1..P751_4 are not defined because they are equal to
  // P751_0; code that needs them reuses the register holding P751_0.
  11. #define P751_0 $0xFFFFFFFFFFFFFFFF
  12. #define P751_5 $0xEEAFFFFFFFFFFFFF
  13. #define P751_6 $0xE3EC968549F878A8
  14. #define P751_7 $0xDA959B1A13F7CC76
  15. #define P751_8 $0x084E9867D6EBE876
  16. #define P751_9 $0x8562B5045CB25748
  17. #define P751_10 $0x0E12909F97BADC66
  18. #define P751_11 $0x00006FE5D541F71C
  // 2 * p751. P751X2_2..P751X2_4 are not defined; the add/sub routines use
  // P751X2_1 for limbs 1 through 4.
  19. #define P751X2_0 $0xFFFFFFFFFFFFFFFE
  20. #define P751X2_1 $0xFFFFFFFFFFFFFFFF
  21. #define P751X2_5 $0xDD5FFFFFFFFFFFFF
  22. #define P751X2_6 $0xC7D92D0A93F0F151
  23. #define P751X2_7 $0xB52B363427EF98ED
  24. #define P751X2_8 $0x109D30CFADD7D0ED
  25. #define P751X2_9 $0x0AC56A08B964AE90
  26. #define P751X2_10 $0x1C25213F2F75B8CD
  27. #define P751X2_11 $0x0000DFCBAA83EE38
  28. // The MSR code uses these registers for parameter passing. Keep using
  29. // them to avoid significant code changes. This means that when the Go
  30. // assembler does something strange, we can diff the machine code
  31. // against a different assembler to find out what Go did.
  // These are plain textual aliases for DI, SI and DX; any instruction that
  // writes one of those registers also overwrites the aliased pointer.
  32. #define REG_P1 DI
  33. #define REG_P2 SI
  34. #define REG_P3 DX
  // fp751StrongReduce(x)
  //
  // In-place conditional subtraction of p751 from the twelve-limb value at x:
  // x - p751 is computed first, and p751 is then added back under an all-ones
  // or all-zeros mask derived from the final borrow. The instruction sequence
  // is the same whatever x holds (no branches).
  //
  // Arguments (8 bytes, no frame): x+0(FP) = pointer to 12 x 64-bit limbs.
  // Registers written: AX, R8-R15, REG_P1, flags.
  TEXT ·fp751StrongReduce(SB), NOSPLIT, $0-8
      MOVQ x+0(FP), REG_P1

      // Distinct limbs of p751. Limbs 0..4 are all equal, so R8 serves for
      // every one of them.
      MOVQ P751_0, R8
      MOVQ P751_5, R9
      MOVQ P751_6, R10
      MOVQ P751_7, R11
      MOVQ P751_8, R12
      MOVQ P751_9, R13
      MOVQ P751_10, R14
      MOVQ P751_11, R15

      // AX = 0 now, so that it can capture the borrow afterwards. XORQ
      // rewrites the flags, hence it must precede the subtraction chain.
      XORQ AX, AX

      // x <- x - p751, borrow rippling from limb 0 up to limb 11.
      SUBQ R8, (0*8)(REG_P1)
      SBBQ R8, (1*8)(REG_P1)
      SBBQ R8, (2*8)(REG_P1)
      SBBQ R8, (3*8)(REG_P1)
      SBBQ R8, (4*8)(REG_P1)
      SBBQ R9, (5*8)(REG_P1)
      SBBQ R10, (6*8)(REG_P1)
      SBBQ R11, (7*8)(REG_P1)
      SBBQ R12, (8*8)(REG_P1)
      SBBQ R13, (9*8)(REG_P1)
      SBBQ R14, (10*8)(REG_P1)
      SBBQ R15, (11*8)(REG_P1)

      // AX <- 0 - borrow: all ones when x - p751 went negative, else zero.
      SBBQ $0, AX

      // Keep p751 only when a correction is needed.
      ANDQ AX, R8
      ANDQ AX, R9
      ANDQ AX, R10
      ANDQ AX, R11
      ANDQ AX, R12
      ANDQ AX, R13
      ANDQ AX, R14
      ANDQ AX, R15

      // x <- x + (p751 & mask)
      ADDQ R8, (0*8)(REG_P1)
      ADCQ R8, (1*8)(REG_P1)
      ADCQ R8, (2*8)(REG_P1)
      ADCQ R8, (3*8)(REG_P1)
      ADCQ R8, (4*8)(REG_P1)
      ADCQ R9, (5*8)(REG_P1)
      ADCQ R10, (6*8)(REG_P1)
      ADCQ R11, (7*8)(REG_P1)
      ADCQ R12, (8*8)(REG_P1)
      ADCQ R13, (9*8)(REG_P1)
      ADCQ R14, (10*8)(REG_P1)
      ADCQ R15, (11*8)(REG_P1)
      RET
  // FP751_CSWAP_LIMB(off): masked swap of the 64-bit words at off(REG_P1) and
  // off(REG_P2). AX must hold the mask (all zeros = leave, all ones = swap).
  // Uses BX, CX and DX as scratch.
  #define FP751_CSWAP_LIMB(off) \
      MOVQ (off)(REG_P1), BX \
      MOVQ (off)(REG_P2), CX \
      MOVQ CX, DX \
      XORQ BX, DX \
      ANDQ AX, DX \
      XORQ DX, BX \
      XORQ DX, CX \
      MOVQ BX, (off)(REG_P1) \
      MOVQ CX, (off)(REG_P2)

  // fp751ConditionalSwap(x, y, choice)
  //
  // Exchanges the twelve-limb values at x and y when choice is 1 and leaves
  // them untouched when choice is 0. Every limb goes through the same
  // load / xor / and / xor / store sequence either way, so there is no branch
  // on choice.
  //
  // Arguments (17 bytes): x+0(FP), y+8(FP) = pointers to 12 x 64-bit limbs;
  // choice+16(FP) = one byte, expected to be 0 or 1.
  // Registers written: AX, BX, CX, DX, REG_P1, REG_P2, flags.
  TEXT ·fp751ConditionalSwap(SB), NOSPLIT, $0-17
      MOVQ x+0(FP), REG_P1
      MOVQ y+8(FP), REG_P2

      // Stretch the choice byte into a full-width mask:
      // 0 -> 0x00..00, 1 -> 0xff..ff.
      MOVB choice+16(FP), AL
      MOVBLZX AL, AX
      NEGQ AX

      // With d = (x[i] ^ y[i]) & mask, x[i] ^= d and y[i] ^= d swaps the
      // pair when the mask is all ones and is a no-op when it is zero.
      FP751_CSWAP_LIMB(0*8)
      FP751_CSWAP_LIMB(1*8)
      FP751_CSWAP_LIMB(2*8)
      FP751_CSWAP_LIMB(3*8)
      FP751_CSWAP_LIMB(4*8)
      FP751_CSWAP_LIMB(5*8)
      FP751_CSWAP_LIMB(6*8)
      FP751_CSWAP_LIMB(7*8)
      FP751_CSWAP_LIMB(8*8)
      FP751_CSWAP_LIMB(9*8)
      FP751_CSWAP_LIMB(10*8)
      FP751_CSWAP_LIMB(11*8)
      RET
  #undef FP751_CSWAP_LIMB
  // fp751AddReduced(z, x, y)
  //
  // z <- x + y - 2*p751, followed by a masked add of 2*p751 when that
  // subtraction borrowed. All values are twelve 64-bit limbs. The carry out
  // of the initial 768-bit addition is discarded.
  //
  // Arguments (24 bytes, no frame): z+0(FP), x+8(FP), y+16(FP).
  // Registers written: AX, CX, R8-R15, REG_P1, REG_P2, REG_P3, flags.
  TEXT ·fp751AddReduced(SB), NOSPLIT, $0-24
      MOVQ z+0(FP), REG_P3
      MOVQ x+8(FP), REG_P1
      MOVQ y+16(FP), REG_P2

      // Limbs 0..8 of x are kept in registers.
      MOVQ (0*8)(REG_P1), R8
      MOVQ (1*8)(REG_P1), R9
      MOVQ (2*8)(REG_P1), R10
      MOVQ (3*8)(REG_P1), R11
      MOVQ (4*8)(REG_P1), R12
      MOVQ (5*8)(REG_P1), R13
      MOVQ (6*8)(REG_P1), R14
      MOVQ (7*8)(REG_P1), R15
      MOVQ (8*8)(REG_P1), CX

      // Limbs 0..8 of x + y.
      ADDQ (0*8)(REG_P2), R8
      ADCQ (1*8)(REG_P2), R9
      ADCQ (2*8)(REG_P2), R10
      ADCQ (3*8)(REG_P2), R11
      ADCQ (4*8)(REG_P2), R12
      ADCQ (5*8)(REG_P2), R13
      ADCQ (6*8)(REG_P2), R14
      ADCQ (7*8)(REG_P2), R15
      ADCQ (8*8)(REG_P2), CX

      // Limbs 9..11 of x + y pass through AX and are parked in z; the MOVQs
      // in between leave the carry flag alone.
      MOVQ (9*8)(REG_P1), AX
      ADCQ (9*8)(REG_P2), AX
      MOVQ AX, (9*8)(REG_P3)
      MOVQ (10*8)(REG_P1), AX
      ADCQ (10*8)(REG_P2), AX
      MOVQ AX, (10*8)(REG_P3)
      MOVQ (11*8)(REG_P1), AX
      ADCQ (11*8)(REG_P2), AX
      MOVQ AX, (11*8)(REG_P3)

      // Subtract 2*p751 from limbs 0..8. Each constant is staged in AX; limbs
      // 1..4 of 2*p751 all equal P751X2_1.
      MOVQ P751X2_0, AX
      SUBQ AX, R8
      MOVQ P751X2_1, AX
      SBBQ AX, R9
      SBBQ AX, R10
      SBBQ AX, R11
      SBBQ AX, R12
      MOVQ P751X2_5, AX
      SBBQ AX, R13
      MOVQ P751X2_6, AX
      SBBQ AX, R14
      MOVQ P751X2_7, AX
      SBBQ AX, R15
      MOVQ P751X2_8, AX
      SBBQ AX, CX

      // Write back limbs 0..8 (the borrow is still pending in the flags).
      MOVQ R8, (0*8)(REG_P3)
      MOVQ R9, (1*8)(REG_P3)
      MOVQ R10, (2*8)(REG_P3)
      MOVQ R11, (3*8)(REG_P3)
      MOVQ R12, (4*8)(REG_P3)
      MOVQ R13, (5*8)(REG_P3)
      MOVQ R14, (6*8)(REG_P3)
      MOVQ R15, (7*8)(REG_P3)
      MOVQ CX, (8*8)(REG_P3)

      // Carry the borrow through limbs 9..11, operating directly on z.
      MOVQ P751X2_9, AX
      SBBQ AX, (9*8)(REG_P3)
      MOVQ P751X2_10, AX
      SBBQ AX, (10*8)(REG_P3)
      MOVQ P751X2_11, AX
      SBBQ AX, (11*8)(REG_P3)

      // AX <- 0 - borrow. MOVQ $0 is used rather than XORQ because the
      // borrow flag must survive until the SBBQ.
      MOVQ $0, AX
      SBBQ $0, AX

      // 2*p751 masked by AX. SI doubles as REG_P2; y is no longer needed.
      MOVQ P751X2_0, SI
      ANDQ AX, SI
      MOVQ P751X2_1, R8
      ANDQ AX, R8
      MOVQ P751X2_5, R9
      ANDQ AX, R9
      MOVQ P751X2_6, R10
      ANDQ AX, R10
      MOVQ P751X2_7, R11
      ANDQ AX, R11
      MOVQ P751X2_8, R12
      ANDQ AX, R12
      MOVQ P751X2_9, R13
      ANDQ AX, R13
      MOVQ P751X2_10, R14
      ANDQ AX, R14
      MOVQ P751X2_11, R15
      ANDQ AX, R15

      // z <- z + (2*p751 & mask)
      ADDQ SI, (0*8)(REG_P3)
      ADCQ R8, (1*8)(REG_P3)
      ADCQ R8, (2*8)(REG_P3)
      ADCQ R8, (3*8)(REG_P3)
      ADCQ R8, (4*8)(REG_P3)
      ADCQ R9, (5*8)(REG_P3)
      ADCQ R10, (6*8)(REG_P3)
      ADCQ R11, (7*8)(REG_P3)
      ADCQ R12, (8*8)(REG_P3)
      ADCQ R13, (9*8)(REG_P3)
      ADCQ R14, (10*8)(REG_P3)
      ADCQ R15, (11*8)(REG_P3)
      RET
  // fp751SubReduced(z, x, y)
  //
  // z <- x - y, followed by a masked add of 2*p751 when the subtraction
  // borrowed. All values are twelve 64-bit limbs.
  //
  // Arguments (24 bytes, no frame): z+0(FP), x+8(FP), y+16(FP).
  // Registers written: AX, CX, R8-R15, REG_P1, REG_P2, REG_P3, flags.
  TEXT ·fp751SubReduced(SB), NOSPLIT, $0-24
      MOVQ z+0(FP), REG_P3
      MOVQ x+8(FP), REG_P1
      MOVQ y+16(FP), REG_P2

      // Limbs 0..8 of x are kept in registers.
      MOVQ (0*8)(REG_P1), R8
      MOVQ (1*8)(REG_P1), R9
      MOVQ (2*8)(REG_P1), R10
      MOVQ (3*8)(REG_P1), R11
      MOVQ (4*8)(REG_P1), R12
      MOVQ (5*8)(REG_P1), R13
      MOVQ (6*8)(REG_P1), R14
      MOVQ (7*8)(REG_P1), R15
      MOVQ (8*8)(REG_P1), CX

      // Limbs 0..8 of x - y.
      SUBQ (0*8)(REG_P2), R8
      SBBQ (1*8)(REG_P2), R9
      SBBQ (2*8)(REG_P2), R10
      SBBQ (3*8)(REG_P2), R11
      SBBQ (4*8)(REG_P2), R12
      SBBQ (5*8)(REG_P2), R13
      SBBQ (6*8)(REG_P2), R14
      SBBQ (7*8)(REG_P2), R15
      SBBQ (8*8)(REG_P2), CX

      // Write back limbs 0..8 (the borrow is still pending in the flags).
      MOVQ R8, (0*8)(REG_P3)
      MOVQ R9, (1*8)(REG_P3)
      MOVQ R10, (2*8)(REG_P3)
      MOVQ R11, (3*8)(REG_P3)
      MOVQ R12, (4*8)(REG_P3)
      MOVQ R13, (5*8)(REG_P3)
      MOVQ R14, (6*8)(REG_P3)
      MOVQ R15, (7*8)(REG_P3)
      MOVQ CX, (8*8)(REG_P3)

      // Limbs 9..11 pass through AX on their way into z.
      MOVQ (9*8)(REG_P1), AX
      SBBQ (9*8)(REG_P2), AX
      MOVQ AX, (9*8)(REG_P3)
      MOVQ (10*8)(REG_P1), AX
      SBBQ (10*8)(REG_P2), AX
      MOVQ AX, (10*8)(REG_P3)
      MOVQ (11*8)(REG_P1), AX
      SBBQ (11*8)(REG_P2), AX
      MOVQ AX, (11*8)(REG_P3)

      // AX <- 0 - borrow. MOVQ $0 is used rather than XORQ because the
      // borrow flag must survive until the SBBQ.
      MOVQ $0, AX
      SBBQ $0, AX

      // 2*p751 masked by AX. SI doubles as REG_P2; y is no longer needed.
      MOVQ P751X2_0, SI
      ANDQ AX, SI
      MOVQ P751X2_1, R8
      ANDQ AX, R8
      MOVQ P751X2_5, R9
      ANDQ AX, R9
      MOVQ P751X2_6, R10
      ANDQ AX, R10
      MOVQ P751X2_7, R11
      ANDQ AX, R11
      MOVQ P751X2_8, R12
      ANDQ AX, R12
      MOVQ P751X2_9, R13
      ANDQ AX, R13
      MOVQ P751X2_10, R14
      ANDQ AX, R14
      MOVQ P751X2_11, R15
      ANDQ AX, R15

      // z <- z + (2*p751 & mask); limbs 1..4 of 2*p751 all equal P751X2_1.
      ADDQ SI, (0*8)(REG_P3)
      ADCQ R8, (1*8)(REG_P3)
      ADCQ R8, (2*8)(REG_P3)
      ADCQ R8, (3*8)(REG_P3)
      ADCQ R8, (4*8)(REG_P3)
      ADCQ R9, (5*8)(REG_P3)
      ADCQ R10, (6*8)(REG_P3)
      ADCQ R11, (7*8)(REG_P3)
      ADCQ R12, (8*8)(REG_P3)
      ADCQ R13, (9*8)(REG_P3)
      ADCQ R14, (10*8)(REG_P3)
      ADCQ R15, (11*8)(REG_P3)
      RET
  // fp751Mul(z, x, y): full (unreduced) product of two 12-limb values.
  //
  // x and y are 12 x 64-bit limbs each; z receives the 24-limb product at byte
  // offsets 0..184. The routine is one level of Karatsuba on 6-limb halves
  // (x = AH:AL, y = BH:BL), with each 6x6 product done column by column:
  //   1. z[0..5]  <- AL+AH and z[6..11] <- BL+BH, keeping each carry as a mask;
  //   2. stack[0..11] <- (AL+AH)*(BL+BH), including the carry-mask corrections;
  //   3. z[0..11]  <- AL*BL;
  //   4. z[12..23] <- AH*BH;
  //   5. stack product minus both partial products, added into z at limb 6.
  //
  // Frame: 96 bytes of stack scratch (twelve words addressed off SP), 24 bytes
  // of arguments. Registers written: AX, CX, DX, SI, DI, R8-R15, flags.
  // z is used as scratch before x and y have been fully read, so this code
  // presumably requires that z does not overlap x or y (confirm with callers).
  422. TEXT ·fp751Mul(SB), $96-24
  423. // Here we store the destination in CX instead of in REG_P3 because the
  424. // multiplication instructions use DX as an implicit destination
  425. // operand: MULQ $REG sets DX:AX <-- AX * $REG.
  426. MOVQ z+0(FP), CX
  427. MOVQ x+8(FP), REG_P1
  428. MOVQ y+16(FP), REG_P2
  // z[0..5] <- AL + AH. AX is zeroed first and ends up as 0 - carry
  // (all ones if the 384-bit sum overflowed).
  429. XORQ AX, AX
  430. MOVQ (48)(REG_P1), R8
  431. MOVQ (56)(REG_P1), R9
  432. MOVQ (64)(REG_P1), R10
  433. MOVQ (72)(REG_P1), R11
  434. MOVQ (80)(REG_P1), R12
  435. MOVQ (88)(REG_P1), R13
  436. ADDQ (REG_P1), R8
  437. ADCQ (8)(REG_P1), R9
  438. ADCQ (16)(REG_P1), R10
  439. ADCQ (24)(REG_P1), R11
  440. ADCQ (32)(REG_P1), R12
  441. ADCQ (40)(REG_P1), R13
  442. MOVQ R8, (CX)
  443. MOVQ R9, (8)(CX)
  444. MOVQ R10, (16)(CX)
  445. MOVQ R11, (24)(CX)
  446. MOVQ R12, (32)(CX)
  447. MOVQ R13, (40)(CX)
  448. SBBQ $0, AX
  // z[6..11] <- BL + BH, with DX as the matching carry mask. R8..R10 keep the
  // three low limbs of this sum for the multiplication that follows.
  449. XORQ DX, DX
  450. MOVQ (48)(REG_P2), R8
  451. MOVQ (56)(REG_P2), R9
  452. MOVQ (64)(REG_P2), R10
  453. MOVQ (72)(REG_P2), R11
  454. MOVQ (80)(REG_P2), R12
  455. MOVQ (88)(REG_P2), R13
  456. ADDQ (REG_P2), R8
  457. ADCQ (8)(REG_P2), R9
  458. ADCQ (16)(REG_P2), R10
  459. ADCQ (24)(REG_P2), R11
  460. ADCQ (32)(REG_P2), R12
  461. ADCQ (40)(REG_P2), R13
  462. MOVQ R8, (48)(CX)
  463. MOVQ R9, (56)(CX)
  464. MOVQ R10, (64)(CX)
  465. MOVQ R11, (72)(CX)
  466. MOVQ R12, (80)(CX)
  467. MOVQ R13, (88)(CX)
  468. SBBQ $0, DX
  // Park both carry masks on the stack: 80(SP) = mask of AL+AH,
  // 88(SP) = mask of BL+BH.
  469. MOVQ AX, (80)(SP)
  470. MOVQ DX, (88)(SP)
  471. // (SP[0-8],R10,R8,R9) <- (AH+AL)*(BH+BL)
  // Column-wise 6x6 product of z[0..5] by z[6..11]. R8/R9/R10/R15 rotate as a
  // three-word accumulator; each finished column cN is written out.
  472. MOVQ (CX), R11
  473. MOVQ R8, AX
  474. MULQ R11
  475. MOVQ AX, (SP) // c0
  476. MOVQ DX, R14
  477. XORQ R15, R15
  478. MOVQ R9, AX
  479. MULQ R11
  480. XORQ R9, R9
  481. ADDQ AX, R14
  482. ADCQ DX, R9
  483. MOVQ (8)(CX), R12
  484. MOVQ R8, AX
  485. MULQ R12
  486. ADDQ AX, R14
  487. MOVQ R14, (8)(SP) // c1
  488. ADCQ DX, R9
  489. ADCQ $0, R15
  490. XORQ R8, R8
  491. MOVQ R10, AX
  492. MULQ R11
  493. ADDQ AX, R9
  494. MOVQ (48)(CX), R13
  495. ADCQ DX, R15
  496. ADCQ $0, R8
  497. MOVQ (16)(CX), AX
  498. MULQ R13
  499. ADDQ AX, R9
  500. ADCQ DX, R15
  501. MOVQ (56)(CX), AX
  502. ADCQ $0, R8
  503. MULQ R12
  504. ADDQ AX, R9
  505. MOVQ R9, (16)(SP) // c2
  506. ADCQ DX, R15
  507. ADCQ $0, R8
  508. XORQ R9, R9
  509. MOVQ (72)(CX), AX
  510. MULQ R11
  511. ADDQ AX, R15
  512. ADCQ DX, R8
  513. ADCQ $0, R9
  514. MOVQ (24)(CX), AX
  515. MULQ R13
  516. ADDQ AX, R15
  517. ADCQ DX, R8
  518. ADCQ $0, R9
  519. MOVQ R10, AX
  520. MULQ R12
  521. ADDQ AX, R15
  522. ADCQ DX, R8
  523. ADCQ $0, R9
  524. MOVQ (16)(CX), R14
  525. MOVQ (56)(CX), AX
  526. MULQ R14
  527. ADDQ AX, R15
  528. MOVQ R15, (24)(SP) // c3
  529. ADCQ DX, R8
  530. ADCQ $0, R9
  531. XORQ R10, R10
  532. MOVQ (80)(CX), AX
  533. MULQ R11
  534. ADDQ AX, R8
  535. ADCQ DX, R9
  536. ADCQ $0, R10
  537. MOVQ (64)(CX), AX
  538. MULQ R14
  539. ADDQ AX, R8
  540. ADCQ DX, R9
  541. ADCQ $0, R10
  542. MOVQ (48)(CX), R15
  543. MOVQ (32)(CX), AX
  544. MULQ R15
  545. ADDQ AX, R8
  546. ADCQ DX, R9
  547. ADCQ $0, R10
  548. MOVQ (72)(CX), AX
  549. MULQ R12
  550. ADDQ AX, R8
  551. ADCQ DX, R9
  552. ADCQ $0, R10
  553. MOVQ (24)(CX), R13
  554. MOVQ (56)(CX), AX
  555. MULQ R13
  556. ADDQ AX, R8
  557. MOVQ R8, (32)(SP) // c4
  558. ADCQ DX, R9
  559. ADCQ $0, R10
  560. XORQ R8, R8
  561. MOVQ (88)(CX), AX
  562. MULQ R11
  563. ADDQ AX, R9
  564. ADCQ DX, R10
  565. ADCQ $0, R8
  566. MOVQ (64)(CX), AX
  567. MULQ R13
  568. ADDQ AX, R9
  569. ADCQ DX, R10
  570. ADCQ $0, R8
  571. MOVQ (72)(CX), AX
  572. MULQ R14
  573. ADDQ AX, R9
  574. ADCQ DX, R10
  575. ADCQ $0, R8
  576. MOVQ (40)(CX), AX
  577. MULQ R15
  578. ADDQ AX, R9
  579. ADCQ DX, R10
  580. ADCQ $0, R8
  581. MOVQ (80)(CX), AX
  582. MULQ R12
  583. ADDQ AX, R9
  584. ADCQ DX, R10
  585. ADCQ $0, R8
  586. MOVQ (32)(CX), R15
  587. MOVQ (56)(CX), AX
  588. MULQ R15
  589. ADDQ AX, R9
  590. MOVQ R9, (40)(SP) // c5
  591. ADCQ DX, R10
  592. ADCQ $0, R8
  593. XORQ R9, R9
  594. MOVQ (64)(CX), AX
  595. MULQ R15
  596. ADDQ AX, R10
  597. ADCQ DX, R8
  598. ADCQ $0, R9
  599. MOVQ (88)(CX), AX
  600. MULQ R12
  601. ADDQ AX, R10
  602. ADCQ DX, R8
  603. ADCQ $0, R9
  604. MOVQ (80)(CX), AX
  605. MULQ R14
  606. ADDQ AX, R10
  607. ADCQ DX, R8
  608. ADCQ $0, R9
  609. MOVQ (40)(CX), R11
  610. MOVQ (56)(CX), AX
  611. MULQ R11
  612. ADDQ AX, R10
  613. ADCQ DX, R8
  614. ADCQ $0, R9
  615. MOVQ (72)(CX), AX
  616. MULQ R13
  617. ADDQ AX, R10
  618. MOVQ R10, (48)(SP) // c6
  619. ADCQ DX, R8
  620. ADCQ $0, R9
  621. XORQ R10, R10
  622. MOVQ (88)(CX), AX
  623. MULQ R14
  624. ADDQ AX, R8
  625. ADCQ DX, R9
  626. ADCQ $0, R10
  627. MOVQ (64)(CX), AX
  628. MULQ R11
  629. ADDQ AX, R8
  630. ADCQ DX, R9
  631. ADCQ $0, R10
  632. MOVQ (80)(CX), AX
  633. MULQ R13
  634. ADDQ AX, R8
  635. ADCQ DX, R9
  636. ADCQ $0, R10
  637. MOVQ (72)(CX), AX
  638. MULQ R15
  639. ADDQ AX, R8
  640. MOVQ R8, (56)(SP) // c7
  641. ADCQ DX, R9
  642. ADCQ $0, R10
  643. XORQ R8, R8
  644. MOVQ (72)(CX), AX
  645. MULQ R11
  646. ADDQ AX, R9
  647. ADCQ DX, R10
  648. ADCQ $0, R8
  649. MOVQ (80)(CX), AX
  650. MULQ R15
  651. ADDQ AX, R9
  652. ADCQ DX, R10
  653. ADCQ $0, R8
  654. MOVQ (88)(CX), AX
  655. MULQ R13
  656. ADDQ AX, R9
  657. MOVQ R9, (64)(SP) // c8
  658. ADCQ DX, R10
  659. ADCQ $0, R8
  660. XORQ R9, R9
  661. MOVQ (88)(CX), AX
  662. MULQ R15
  663. ADDQ AX, R10
  664. ADCQ DX, R8
  665. ADCQ $0, R9
  666. MOVQ (80)(CX), AX
  667. MULQ R11
  668. ADDQ AX, R10 // c9
  669. ADCQ DX, R8
  670. ADCQ $0, R9
  671. MOVQ (88)(CX), AX
  672. MULQ R11
  673. ADDQ AX, R8 // c10
  674. ADCQ DX, R9 // c11
  // First carry correction: if BL+BH overflowed (mask at 88(SP)), add AL+AH
  // into c6..c11. At this point DX, R12, R14, R13, R15, R11 hold
  // z[0], z[1], z[2], z[3], z[4], z[5] respectively (the limbs of AL+AH).
  675. MOVQ (88)(SP), AX
  676. MOVQ (CX), DX
  677. ANDQ AX, R12
  678. ANDQ AX, R14
  679. ANDQ AX, DX
  680. ANDQ AX, R13
  681. ANDQ AX, R15
  682. ANDQ AX, R11
  683. MOVQ (48)(SP), AX
  684. ADDQ AX, DX
  685. MOVQ (56)(SP), AX
  686. ADCQ AX, R12
  687. MOVQ (64)(SP), AX
  688. ADCQ AX, R14
  689. ADCQ R10, R13
  690. ADCQ R8, R15
  691. ADCQ R9, R11
  // Fetch the AL+AH carry mask before its stack slot (80(SP)) is reused for
  // c10, then spill the corrected c6..c11.
  692. MOVQ (80)(SP), AX
  693. MOVQ DX, (48)(SP)
  694. MOVQ R12, (56)(SP)
  695. MOVQ R14, (64)(SP)
  696. MOVQ R13, (72)(SP)
  697. MOVQ R15, (80)(SP)
  698. MOVQ R11, (88)(SP)
  // Second carry correction: if AL+AH overflowed, add BL+BH (z[6..11]) into
  // c6..c11 as well.
  699. MOVQ (48)(CX), R8
  700. MOVQ (56)(CX), R9
  701. MOVQ (64)(CX), R10
  702. MOVQ (72)(CX), R11
  703. MOVQ (80)(CX), R12
  704. MOVQ (88)(CX), R13
  705. ANDQ AX, R8
  706. ANDQ AX, R9
  707. ANDQ AX, R10
  708. ANDQ AX, R11
  709. ANDQ AX, R12
  710. ANDQ AX, R13
  711. MOVQ (48)(SP), AX
  712. ADDQ AX, R8
  713. MOVQ (56)(SP), AX
  714. ADCQ AX, R9
  715. MOVQ (64)(SP), AX
  716. ADCQ AX, R10
  717. MOVQ (72)(SP), AX
  718. ADCQ AX, R11
  719. MOVQ (80)(SP), AX
  720. ADCQ AX, R12
  721. MOVQ (88)(SP), AX
  722. ADCQ AX, R13
  // Store c6, c7 and c9 now; c8 (R10), c10 (R12) and c11 (R13) are stored a
  // little further down, interleaved with the start of the next product.
  723. MOVQ R8, (48)(SP)
  724. MOVQ R9, (56)(SP)
  725. MOVQ R11, (72)(SP)
  726. // CX[0-11] <- AL*BL
  // Same column-wise scheme, reading x and y directly and writing into z.
  // This overwrites the AL+AH / BL+BH sums held in z[0..11].
  727. MOVQ (REG_P1), R11
  728. MOVQ (REG_P2), AX
  729. MULQ R11
  730. XORQ R9, R9
  731. MOVQ AX, (CX) // c0
  732. MOVQ R10, (64)(SP)
  733. MOVQ DX, R8
  734. MOVQ (8)(REG_P2), AX
  735. MULQ R11
  736. XORQ R10, R10
  737. ADDQ AX, R8
  738. MOVQ R12, (80)(SP)
  739. ADCQ DX, R9
  740. MOVQ (8)(REG_P1), R12
  741. MOVQ (REG_P2), AX
  742. MULQ R12
  743. ADDQ AX, R8
  744. MOVQ R8, (8)(CX) // c1
  745. ADCQ DX, R9
  746. MOVQ R13, (88)(SP)
  747. ADCQ $0, R10
  748. XORQ R8, R8
  749. MOVQ (16)(REG_P2), AX
  750. MULQ R11
  751. ADDQ AX, R9
  752. ADCQ DX, R10
  753. ADCQ $0, R8
  754. MOVQ (REG_P2), R13
  755. MOVQ (16)(REG_P1), AX
  756. MULQ R13
  757. ADDQ AX, R9
  758. ADCQ DX, R10
  759. ADCQ $0, R8
  760. MOVQ (8)(REG_P2), AX
  761. MULQ R12
  762. ADDQ AX, R9
  763. MOVQ R9, (16)(CX) // c2
  764. ADCQ DX, R10
  765. ADCQ $0, R8
  766. XORQ R9, R9
  767. MOVQ (24)(REG_P2), AX
  768. MULQ R11
  769. ADDQ AX, R10
  770. ADCQ DX, R8
  771. ADCQ $0, R9
  772. MOVQ (24)(REG_P1), AX
  773. MULQ R13
  774. ADDQ AX, R10
  775. ADCQ DX, R8
  776. ADCQ $0, R9
  777. MOVQ (16)(REG_P2), AX
  778. MULQ R12
  779. ADDQ AX, R10
  780. ADCQ DX, R8
  781. ADCQ $0, R9
  782. MOVQ (16)(REG_P1), R14
  783. MOVQ (8)(REG_P2), AX
  784. MULQ R14
  785. ADDQ AX, R10
  786. MOVQ R10, (24)(CX) // c3
  787. ADCQ DX, R8
  788. ADCQ $0, R9
  789. XORQ R10, R10
  790. MOVQ (32)(REG_P2), AX
  791. MULQ R11
  792. ADDQ AX, R8
  793. ADCQ DX, R9
  794. ADCQ $0, R10
  795. MOVQ (16)(REG_P2), AX
  796. MULQ R14
  797. ADDQ AX, R8
  798. ADCQ DX, R9
  799. ADCQ $0, R10
  800. MOVQ (32)(REG_P1), AX
  801. MULQ R13
  802. ADDQ AX, R8
  803. ADCQ DX, R9
  804. ADCQ $0, R10
  805. MOVQ (24)(REG_P2), AX
  806. MULQ R12
  807. ADDQ AX, R8
  808. ADCQ DX, R9
  809. ADCQ $0, R10
  810. MOVQ (24)(REG_P1), R13
  811. MOVQ (8)(REG_P2), AX
  812. MULQ R13
  813. ADDQ AX, R8
  814. MOVQ R8, (32)(CX) // c4
  815. ADCQ DX, R9
  816. ADCQ $0, R10
  817. XORQ R8, R8
  818. MOVQ (40)(REG_P2), AX
  819. MULQ R11
  820. ADDQ AX, R9
  821. ADCQ DX, R10
  822. ADCQ $0, R8
  823. MOVQ (16)(REG_P2), AX
  824. MULQ R13
  825. ADDQ AX, R9
  826. ADCQ DX, R10
  827. ADCQ $0, R8
  828. MOVQ (24)(REG_P2), AX
  829. MULQ R14
  830. ADDQ AX, R9
  831. ADCQ DX, R10
  832. ADCQ $0, R8
  833. MOVQ (40)(REG_P1), R11
  834. MOVQ (REG_P2), AX
  835. MULQ R11
  836. ADDQ AX, R9
  837. ADCQ DX, R10
  838. ADCQ $0, R8
  839. MOVQ (32)(REG_P2), AX
  840. MULQ R12
  841. ADDQ AX, R9
  842. ADCQ DX, R10
  843. ADCQ $0, R8
  844. MOVQ (32)(REG_P1), R15
  845. MOVQ (8)(REG_P2), AX
  846. MULQ R15
  847. ADDQ AX, R9
  848. MOVQ R9, (40)(CX) //c5
  849. ADCQ DX, R10
  850. ADCQ $0, R8
  851. XORQ R9, R9
  852. MOVQ (16)(REG_P2), AX
  853. MULQ R15
  854. ADDQ AX, R10
  855. ADCQ DX, R8
  856. ADCQ $0, R9
  857. MOVQ (40)(REG_P2), AX
  858. MULQ R12
  859. ADDQ AX, R10
  860. ADCQ DX, R8
  861. ADCQ $0, R9
  862. MOVQ (32)(REG_P2), AX
  863. MULQ R14
  864. ADDQ AX, R10
  865. ADCQ DX, R8
  866. ADCQ $0, R9
  867. MOVQ (8)(REG_P2), AX
  868. MULQ R11
  869. ADDQ AX, R10
  870. ADCQ DX, R8
  871. ADCQ $0, R9
  872. MOVQ (24)(REG_P2), AX
  873. MULQ R13
  874. ADDQ AX, R10
  875. MOVQ R10, (48)(CX) // c6
  876. ADCQ DX, R8
  877. ADCQ $0, R9
  878. XORQ R10, R10
  879. MOVQ (40)(REG_P2), AX
  880. MULQ R14
  881. ADDQ AX, R8
  882. ADCQ DX, R9
  883. ADCQ $0, R10
  884. MOVQ (16)(REG_P2), AX
  885. MULQ R11
  886. ADDQ AX, R8
  887. ADCQ DX, R9
  888. ADCQ $0, R10
  889. MOVQ (32)(REG_P2), AX
  890. MULQ R13
  891. ADDQ AX, R8
  892. ADCQ DX, R9
  893. ADCQ $0, R10
  894. MOVQ (24)(REG_P2), AX
  895. MULQ R15
  896. ADDQ AX, R8
  897. MOVQ R8, (56)(CX) // c7
  898. ADCQ DX, R9
  899. ADCQ $0, R10
  900. XORQ R8, R8
  901. MOVQ (24)(REG_P2), AX
  902. MULQ R11
  903. ADDQ AX, R9
  904. ADCQ DX, R10
  905. ADCQ $0, R8
  906. MOVQ (32)(REG_P2), AX
  907. MULQ R15
  908. ADDQ AX, R9
  909. ADCQ DX, R10
  910. ADCQ $0, R8
  911. MOVQ (40)(REG_P2), AX
  912. MULQ R13
  913. ADDQ AX, R9
  914. MOVQ R9, (64)(CX) // c8
  915. ADCQ DX, R10
  916. ADCQ $0, R8
  917. XORQ R9, R9
  918. MOVQ (40)(REG_P2), AX
  919. MULQ R15
  920. ADDQ AX, R10
  921. ADCQ DX, R8
  922. ADCQ $0, R9
  923. MOVQ (32)(REG_P2), AX
  924. MULQ R11
  925. ADDQ AX, R10
  926. MOVQ R10, (72)(CX) // c9
  927. ADCQ DX, R8
  928. ADCQ $0, R9
  929. MOVQ (40)(REG_P2), AX
  930. MULQ R11
  931. ADDQ AX, R8
  932. MOVQ R8, (80)(CX) // c10
  933. ADCQ DX, R9
  934. MOVQ R9, (88)(CX) // c11
  935. // CX[12-23] <- AH*BH
  // Same scheme on the high halves (byte offsets 48..88 of x and y), written
  // to byte offsets 96..184 of z.
  936. MOVQ (48)(REG_P1), R11
  937. MOVQ (48)(REG_P2), AX
  938. MULQ R11
  939. XORQ R9, R9
  940. MOVQ AX, (96)(CX) // c0
  941. MOVQ DX, R8
  942. MOVQ (56)(REG_P2), AX
  943. MULQ R11
  944. XORQ R10, R10
  945. ADDQ AX, R8
  946. ADCQ DX, R9
  947. MOVQ (56)(REG_P1), R12
  948. MOVQ (48)(REG_P2), AX
  949. MULQ R12
  950. ADDQ AX, R8
  951. MOVQ R8, (104)(CX) // c1
  952. ADCQ DX, R9
  953. ADCQ $0, R10
  954. XORQ R8, R8
  955. MOVQ (64)(REG_P2), AX
  956. MULQ R11
  957. ADDQ AX, R9
  958. ADCQ DX, R10
  959. ADCQ $0, R8
  960. MOVQ (48)(REG_P2), R13
  961. MOVQ (64)(REG_P1), AX
  962. MULQ R13
  963. ADDQ AX, R9
  964. ADCQ DX, R10
  965. ADCQ $0, R8
  966. MOVQ (56)(REG_P2), AX
  967. MULQ R12
  968. ADDQ AX, R9
  969. MOVQ R9, (112)(CX) // c2
  970. ADCQ DX, R10
  971. ADCQ $0, R8
  972. XORQ R9, R9
  973. MOVQ (72)(REG_P2), AX
  974. MULQ R11
  975. ADDQ AX, R10
  976. ADCQ DX, R8
  977. ADCQ $0, R9
  978. MOVQ (72)(REG_P1), AX
  979. MULQ R13
  980. ADDQ AX, R10
  981. ADCQ DX, R8
  982. ADCQ $0, R9
  983. MOVQ (64)(REG_P2), AX
  984. MULQ R12
  985. ADDQ AX, R10
  986. ADCQ DX, R8
  987. ADCQ $0, R9
  988. MOVQ (64)(REG_P1), R14
  989. MOVQ (56)(REG_P2), AX
  990. MULQ R14
  991. ADDQ AX, R10
  992. MOVQ R10, (120)(CX) // c3
  993. ADCQ DX, R8
  994. ADCQ $0, R9
  995. XORQ R10, R10
  996. MOVQ (80)(REG_P2), AX
  997. MULQ R11
  998. ADDQ AX, R8
  999. ADCQ DX, R9
  1000. ADCQ $0, R10
  1001. MOVQ (64)(REG_P2), AX
  1002. MULQ R14
  1003. ADDQ AX, R8
  1004. ADCQ DX, R9
  1005. ADCQ $0, R10
  1006. MOVQ (80)(REG_P1), R15
  1007. MOVQ R13, AX
  1008. MULQ R15
  1009. ADDQ AX, R8
  1010. ADCQ DX, R9
  1011. ADCQ $0, R10
  1012. MOVQ (72)(REG_P2), AX
  1013. MULQ R12
  1014. ADDQ AX, R8
  1015. ADCQ DX, R9
  1016. ADCQ $0, R10
  1017. MOVQ (72)(REG_P1), R13
  1018. MOVQ (56)(REG_P2), AX
  1019. MULQ R13
  1020. ADDQ AX, R8
  1021. MOVQ R8, (128)(CX) // c4
  1022. ADCQ DX, R9
  1023. ADCQ $0, R10
  1024. XORQ R8, R8
  1025. MOVQ (88)(REG_P2), AX
  1026. MULQ R11
  1027. ADDQ AX, R9
  1028. ADCQ DX, R10
  1029. ADCQ $0, R8
  1030. MOVQ (64)(REG_P2), AX
  1031. MULQ R13
  1032. ADDQ AX, R9
  1033. ADCQ DX, R10
  1034. ADCQ $0, R8
  1035. MOVQ (72)(REG_P2), AX
  1036. MULQ R14
  1037. ADDQ AX, R9
  1038. ADCQ DX, R10
  1039. ADCQ $0, R8
  1040. MOVQ (88)(REG_P1), R11
  1041. MOVQ (48)(REG_P2), AX
  1042. MULQ R11
  1043. ADDQ AX, R9
  1044. ADCQ DX, R10
  1045. ADCQ $0, R8
  1046. MOVQ (80)(REG_P2), AX
  1047. MULQ R12
  1048. ADDQ AX, R9
  1049. ADCQ DX, R10
  1050. ADCQ $0, R8
  1051. MOVQ (56)(REG_P2), AX
  1052. MULQ R15
  1053. ADDQ AX, R9
  1054. MOVQ R9, (136)(CX) // c5
  1055. ADCQ DX, R10
  1056. ADCQ $0, R8
  1057. XORQ R9, R9
  1058. MOVQ (64)(REG_P2), AX
  1059. MULQ R15
  1060. ADDQ AX, R10
  1061. ADCQ DX, R8
  1062. ADCQ $0, R9
  1063. MOVQ (88)(REG_P2), AX
  1064. MULQ R12
  1065. ADDQ AX, R10
  1066. ADCQ DX, R8
  1067. ADCQ $0, R9
  1068. MOVQ (80)(REG_P2), AX
  1069. MULQ R14
  1070. ADDQ AX, R10
  1071. ADCQ DX, R8
  1072. ADCQ $0, R9
  1073. MOVQ (56)(REG_P2), AX
  1074. MULQ R11
  1075. ADDQ AX, R10
  1076. ADCQ DX, R8
  1077. ADCQ $0, R9
  1078. MOVQ (72)(REG_P2), AX
  1079. MULQ R13
  1080. ADDQ AX, R10
  1081. MOVQ R10, (144)(CX) // c6
  1082. ADCQ DX, R8
  1083. ADCQ $0, R9
  1084. XORQ R10, R10
  1085. MOVQ (88)(REG_P2), AX
  1086. MULQ R14
  1087. ADDQ AX, R8
  1088. ADCQ DX, R9
  1089. ADCQ $0, R10
  1090. MOVQ (64)(REG_P2), AX
  1091. MULQ R11
  1092. ADDQ AX, R8
  1093. ADCQ DX, R9
  1094. ADCQ $0, R10
  1095. MOVQ (80)(REG_P2), AX
  1096. MULQ R13
  1097. ADDQ AX, R8
  1098. ADCQ DX, R9
  1099. ADCQ $0, R10
  1100. MOVQ (72)(REG_P2), AX
  1101. MULQ R15
  1102. ADDQ AX, R8
  1103. MOVQ R8, (152)(CX) // c7
  1104. ADCQ DX, R9
  1105. ADCQ $0, R10
  1106. XORQ R8, R8
  1107. MOVQ (72)(REG_P2), AX
  1108. MULQ R11
  1109. ADDQ AX, R9
  1110. ADCQ DX, R10
  1111. ADCQ $0, R8
  1112. MOVQ (80)(REG_P2), AX
  1113. MULQ R15
  1114. ADDQ AX, R9
  1115. ADCQ DX, R10
  1116. ADCQ $0, R8
  1117. MOVQ (88)(REG_P2), AX
  1118. MULQ R13
  1119. ADDQ AX, R9
  1120. MOVQ R9, (160)(CX) // c8
  1121. ADCQ DX, R10
  1122. ADCQ $0, R8
  // NOTE(review): unlike the AL*BL block, the last columns here (c9..c11) keep
  // no third accumulator word, so a carry out of R8 would be dropped.
  // Presumably this relies on the top limbs of x and y being small enough
  // that it cannot occur - confirm against the callers' input bounds.
  1123. MOVQ (88)(REG_P2), AX
  1124. MULQ R15
  1125. ADDQ AX, R10
  1126. ADCQ DX, R8
  1127. MOVQ (80)(REG_P2), AX
  1128. MULQ R11
  1129. ADDQ AX, R10
  1130. MOVQ R10, (168)(CX) // c9
  1131. ADCQ DX, R8
  1132. MOVQ (88)(REG_P2), AX
  1133. MULQ R11
  1134. ADDQ AX, R8
  1135. MOVQ R8, (176)(CX) // c10
  1136. ADCQ $0, DX
  1137. MOVQ DX, (184)(CX) // c11
  1138. // [R8-R15,AX,DX,DI,(SP)] <- (AH+AL)*(BH+BL)-AL*BL
  // DI and SI (REG_P1 / REG_P2) are reused as data registers from here on;
  // x and y are not read again. The twelfth word is computed in SI and then
  // spilled to (SP).
  1139. MOVQ (SP), R8
  1140. SUBQ (CX), R8
  1141. MOVQ (8)(SP), R9
  1142. SBBQ (8)(CX), R9
  1143. MOVQ (16)(SP), R10
  1144. SBBQ (16)(CX), R10
  1145. MOVQ (24)(SP), R11
  1146. SBBQ (24)(CX), R11
  1147. MOVQ (32)(SP), R12
  1148. SBBQ (32)(CX), R12
  1149. MOVQ (40)(SP), R13
  1150. SBBQ (40)(CX), R13
  1151. MOVQ (48)(SP), R14
  1152. SBBQ (48)(CX), R14
  1153. MOVQ (56)(SP), R15
  1154. SBBQ (56)(CX), R15
  1155. MOVQ (64)(SP), AX
  1156. SBBQ (64)(CX), AX
  1157. MOVQ (72)(SP), DX
  1158. SBBQ (72)(CX), DX
  1159. MOVQ (80)(SP), DI
  1160. SBBQ (80)(CX), DI
  1161. MOVQ (88)(SP), SI
  1162. SBBQ (88)(CX), SI
  1163. MOVQ SI, (SP)
  1164. // [R8-R15,AX,DX,DI,(SP)] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH
  // SI is the staging register for each word of AH*BH; the twelfth result
  // word ends up in SI rather than back on the stack.
  1165. MOVQ (96)(CX), SI
  1166. SUBQ SI, R8
  1167. MOVQ (104)(CX), SI
  1168. SBBQ SI, R9
  1169. MOVQ (112)(CX), SI
  1170. SBBQ SI, R10
  1171. MOVQ (120)(CX), SI
  1172. SBBQ SI, R11
  1173. MOVQ (128)(CX), SI
  1174. SBBQ SI, R12
  1175. MOVQ (136)(CX), SI
  1176. SBBQ SI, R13
  1177. MOVQ (144)(CX), SI
  1178. SBBQ SI, R14
  1179. MOVQ (152)(CX), SI
  1180. SBBQ SI, R15
  1181. MOVQ (160)(CX), SI
  1182. SBBQ SI, AX
  1183. MOVQ (168)(CX), SI
  1184. SBBQ SI, DX
  1185. MOVQ (176)(CX), SI
  1186. SBBQ SI, DI
  1187. MOVQ (SP), SI
  1188. SBBQ (184)(CX), SI
  1189. // FINAL RESULT
  // Add the 12-word middle term into z starting at word 6 (byte offset 48),
  // then ripple the remaining carry through words 18..23 (offsets 144..184).
  1190. ADDQ (48)(CX), R8
  1191. MOVQ R8, (48)(CX)
  1192. ADCQ (56)(CX), R9
  1193. MOVQ R9, (56)(CX)
  1194. ADCQ (64)(CX), R10
  1195. MOVQ R10, (64)(CX)
  1196. ADCQ (72)(CX), R11
  1197. MOVQ R11, (72)(CX)
  1198. ADCQ (80)(CX), R12
  1199. MOVQ R12, (80)(CX)
  1200. ADCQ (88)(CX), R13
  1201. MOVQ R13, (88)(CX)
  1202. ADCQ (96)(CX), R14
  1203. MOVQ R14, (96)(CX)
  1204. ADCQ (104)(CX), R15
  1205. MOVQ R15, (104)(CX)
  1206. ADCQ (112)(CX), AX
  1207. MOVQ AX, (112)(CX)
  1208. ADCQ (120)(CX), DX
  1209. MOVQ DX, (120)(CX)
  1210. ADCQ (128)(CX), DI
  1211. MOVQ DI, (128)(CX)
  1212. ADCQ (136)(CX), SI
  1213. MOVQ SI, (136)(CX)
  1214. MOVQ (144)(CX), AX
  1215. ADCQ $0, AX
  1216. MOVQ AX, (144)(CX)
  1217. MOVQ (152)(CX), AX
  1218. ADCQ $0, AX
  1219. MOVQ AX, (152)(CX)
  1220. MOVQ (160)(CX), AX
  1221. ADCQ $0, AX
  1222. MOVQ AX, (160)(CX)
  1223. MOVQ (168)(CX), AX
  1224. ADCQ $0, AX
  1225. MOVQ AX, (168)(CX)
  1226. MOVQ (176)(CX), AX
  1227. ADCQ $0, AX
  1228. MOVQ AX, (176)(CX)
  1229. MOVQ (184)(CX), AX
  1230. ADCQ $0, AX
  1231. MOVQ AX, (184)(CX)
  1232. RET
  1233. // This multiplies the 256-bit number at M0 (four 64-bit words, read at
  // offsets 0, 8, 16 and 24) by p751+1.
  1234. // It is assumed that M1 names p751+1 stored as a 768-bit Fp751Element; only
  // its seven upper words (M1+40(SB) .. M1+88(SB), 448 bits) are read.
  1235. // C points to the place to store the result and should be at least 192 bits:
  // only the three lowest product words are written, to 0+C, 8+C and 16+C.
  // The other eight words of the 11-word product are left in registers, from
  // least to most significant: T7, T0, T1, T3, T6, T5, T2, T4.
  1236. // This should only be used when the BMI2 and ADX instruction set extensions
  1237. // are available.
  // MULXQ (BMI2) takes its multiplicand from DX and writes low/high halves
  // without touching the flags; ADCXQ and ADOXQ (ADX) then run two separate
  // carry chains, on CF and OF respectively. Each XORQ AX, AX clears both
  // flags and leaves AX = 0 for the chain-terminating adds.
  // T0..T10 must be distinct registers other than AX and DX, both of which
  // the macro overwrites. The body is a backslash-continued macro, so do not
  // insert comment-only lines inside it.
  1238. #define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
  1239. MOVQ 0+M0, DX \
  1240. MULXQ M1+40(SB), T1, T0 \
  1241. MULXQ M1+48(SB), T3, T2 \
  1242. MOVQ T1, 0+C \ // C0_final
  1243. XORQ AX, AX \
  1244. MULXQ M1+56(SB), T5, T4 \
  1245. ADOXQ T3, T0 \
  1246. ADOXQ T5, T2 \
  1247. MULXQ M1+64(SB), T3, T1 \
  1248. ADOXQ T3, T4 \
  1249. MULXQ M1+72(SB), T6, T5 \
  1250. ADOXQ T6, T1 \
  1251. MULXQ M1+80(SB), T7, T3 \
  1252. ADOXQ T7, T5 \
  1253. MULXQ M1+88(SB), T8, T6 \
  1254. ADOXQ T8, T3 \
  1255. ADOXQ AX, T6 \
  1256. \
  1257. MOVQ 8+M0, DX \
  1258. MULXQ M1+40(SB), T7, T8 \
  1259. XORQ AX, AX \
  1260. ADCXQ T7, T0 \
  1261. MOVQ T0, 8+C \ // C1_final
  1262. ADCXQ T8, T2 \
  1263. MULXQ M1+48(SB), T8, T7 \
  1264. ADOXQ T8, T2 \
  1265. ADCXQ T7, T4 \
  1266. MULXQ M1+56(SB), T8, T0 \
  1267. ADOXQ T8, T4 \
  1268. ADCXQ T1, T0 \
  1269. MULXQ M1+64(SB), T7, T1 \
  1270. ADCXQ T5, T1 \
  1271. MULXQ M1+72(SB), T8, T5 \
  1272. ADCXQ T5, T3 \
  1273. MULXQ M1+80(SB), T9, T5 \
  1274. ADCXQ T5, T6 \
  1275. MULXQ M1+88(SB), DX, T5 \
  1276. ADCXQ AX, T5 \
  1277. \
  1278. ADOXQ T7, T0 \
  1279. ADOXQ T8, T1 \
  1280. ADOXQ T9, T3 \
  1281. ADOXQ DX, T6 \
  1282. ADOXQ AX, T5 \
  1283. \
  1284. MOVQ 16+M0, DX \
  1285. MULXQ M1+40(SB), T7, T8 \
  1286. XORQ AX, AX \
  1287. ADCXQ T7, T2 \
  1288. MOVQ T2, 16+C \ // C2_final
  1289. ADCXQ T8, T4 \
  1290. MULXQ M1+48(SB), T7, T8 \
  1291. ADOXQ T7, T4 \
  1292. ADCXQ T8, T0 \
  1293. MULXQ M1+56(SB), T8, T2 \
  1294. ADOXQ T8, T0 \
  1295. ADCXQ T2, T1 \
  1296. MULXQ M1+64(SB), T7, T2 \
  1297. ADCXQ T2, T3 \
  1298. MULXQ M1+72(SB), T8, T2 \
  1299. ADCXQ T2, T6 \
  1300. MULXQ M1+80(SB), T9, T2 \
  1301. ADCXQ T2, T5 \
  1302. MULXQ M1+88(SB), DX, T2 \
  1303. ADCXQ AX, T2 \
  1304. \
  1305. ADOXQ T7, T1 \
  1306. ADOXQ T8, T3 \
  1307. ADOXQ T9, T6 \
  1308. ADOXQ DX, T5 \
  1309. ADOXQ AX, T2 \
  1310. \
  1311. MOVQ 24+M0, DX \
  1312. MULXQ M1+40(SB), T7, T8 \
  1313. XORQ AX, AX \
  1314. ADCXQ T4, T7 \
  1315. ADCXQ T8, T0 \
  1316. MULXQ M1+48(SB), T10, T8 \
  1317. ADOXQ T10, T0 \
  1318. ADCXQ T8, T1 \
  1319. MULXQ M1+56(SB), T8, T4 \
  1320. ADOXQ T8, T1 \
  1321. ADCXQ T4, T3 \
  1322. MULXQ M1+64(SB), T10, T4 \
  1323. ADCXQ T4, T6 \
  1324. MULXQ M1+72(SB), T8, T4 \
  1325. ADCXQ T4, T5 \
  1326. MULXQ M1+80(SB), T9, T4 \
  1327. ADCXQ T4, T2 \
  1328. MULXQ M1+88(SB), DX, T4 \
  1329. ADCXQ AX, T4 \
  1330. \
  1331. ADOXQ T10, T3 \
  1332. ADOXQ T8, T6 \
  1333. ADOXQ T9, T5 \
  1334. ADOXQ DX, T2 \
  1335. ADOXQ AX, T4
  1336. // mul256x448bmi2 multiplies the four 64-bit limbs at M0 (256 bits) by the seven
  1337. // limbs read from M1+40(SB)..M1+88(SB) (448 bits; REDC passes p751+1 as M1) using
  1338. // MULX with plain ADD/ADC carry chains. The three lowest result limbs are stored at
  1339. // 0+C, 8+C, 16+C; the rest stay in T7, T0, T1, T3, T6, T5, T2, T4 (low to high). 32+C and
  1340. // 40+C are written as scratch, so C needs 48 bytes. Requires BMI2. Clobbers AX, DX, T0-T10.
  1341. #define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \
  1342. MOVQ 0+M0, DX \ // round 0: DX = M0[0], the implicit MULX multiplicand
  1343. MULXQ M1+40(SB), T1, T0 \ // T1 is used as the low half below, T0 as the high half
  1344. MULXQ M1+48(SB), T3, T2 \
  1345. MOVQ T1, 0+C \ // C0_final
  1346. XORQ AX, AX \ // AX = 0, used as the zero addend that closes each ADC chain
  1347. MULXQ M1+56(SB), T5, T4 \
  1348. ADDQ T3, T0 \
  1349. ADCQ T5, T2 \
  1350. MULXQ M1+64(SB), T3, T1 \
  1351. ADCQ T3, T4 \
  1352. MULXQ M1+72(SB), T6, T5 \
  1353. ADCQ T6, T1 \
  1354. MULXQ M1+80(SB), T7, T3 \
  1355. ADCQ T7, T5 \
  1356. MULXQ M1+88(SB), T8, T6 \
  1357. ADCQ T8, T3 \
  1358. ADCQ AX, T6 \
  1359. \
  1360. MOVQ 8+M0, DX \ // round 1: DX = M0[1]
  1361. MULXQ M1+40(SB), T7, T8 \
  1362. ADDQ T7, T0 \
  1363. MOVQ T0, 8+C \ // C1_final
  1364. ADCQ T8, T2 \
  1365. MULXQ M1+48(SB), T8, T7 \
  1366. MOVQ T8, 32+C \ // spill low half to scratch; added back in the second pass below
  1367. ADCQ T7, T4 \
  1368. MULXQ M1+56(SB), T8, T0 \
  1369. MOVQ T8, 40+C \ // spill low half to scratch
  1370. ADCQ T1, T0 \
  1371. MULXQ M1+64(SB), T7, T1 \
  1372. ADCQ T5, T1 \
  1373. MULXQ M1+72(SB), T8, T5 \
  1374. ADCQ T5, T3 \
  1375. MULXQ M1+80(SB), T9, T5 \
  1376. ADCQ T5, T6 \
  1377. MULXQ M1+88(SB), DX, T5 \ // DX is reused as an output; it is reloaded at the start of the next round
  1378. ADCQ AX, T5 \
  1379. \
  1380. XORQ AX, AX \
  1381. ADDQ 32+C, T2 \ // second pass: add the held-back low halves
  1382. ADCQ 40+C, T4 \
  1383. ADCQ T7, T0 \
  1384. ADCQ T8, T1 \
  1385. ADCQ T9, T3 \
  1386. ADCQ DX, T6 \
  1387. ADCQ AX, T5 \
  1388. \
  1389. MOVQ 16+M0, DX \ // round 2: DX = M0[2]
  1390. MULXQ M1+40(SB), T7, T8 \
  1391. ADDQ T7, T2 \
  1392. MOVQ T2, 16+C \ // C2_final
  1393. ADCQ T8, T4 \
  1394. MULXQ M1+48(SB), T7, T8 \
  1395. MOVQ T7, 32+C \
  1396. ADCQ T8, T0 \
  1397. MULXQ M1+56(SB), T8, T2 \
  1398. MOVQ T8, 40+C \
  1399. ADCQ T2, T1 \
  1400. MULXQ M1+64(SB), T7, T2 \
  1401. ADCQ T2, T3 \
  1402. MULXQ M1+72(SB), T8, T2 \
  1403. ADCQ T2, T6 \
  1404. MULXQ M1+80(SB), T9, T2 \
  1405. ADCQ T2, T5 \
  1406. MULXQ M1+88(SB), DX, T2 \
  1407. ADCQ AX, T2 \
  1408. \
  1409. XORQ AX, AX \
  1410. ADDQ 32+C, T4 \
  1411. ADCQ 40+C, T0 \
  1412. ADCQ T7, T1 \
  1413. ADCQ T8, T3 \
  1414. ADCQ T9, T6 \
  1415. ADCQ DX, T5 \
  1416. ADCQ AX, T2 \
  1417. \
  1418. MOVQ 24+M0, DX \ // round 3: DX = M0[3]
  1419. MULXQ M1+40(SB), T7, T8 \
  1420. ADDQ T4, T7 \ // result limb 3 stays in T7; it is not stored to C
  1421. ADCQ T8, T0 \
  1422. MULXQ M1+48(SB), T10, T8 \
  1423. MOVQ T10, 32+C \
  1424. ADCQ T8, T1 \
  1425. MULXQ M1+56(SB), T8, T4 \
  1426. MOVQ T8, 40+C \
  1427. ADCQ T4, T3 \
  1428. MULXQ M1+64(SB), T10, T4 \
  1429. ADCQ T4, T6 \
  1430. MULXQ M1+72(SB), T8, T4 \
  1431. ADCQ T4, T5 \
  1432. MULXQ M1+80(SB), T9, T4 \
  1433. ADCQ T4, T2 \
  1434. MULXQ M1+88(SB), DX, T4 \
  1435. ADCQ AX, T4 \
  1436. \
  1437. XORQ AX, AX \
  1438. ADDQ 32+C, T0 \
  1439. ADCQ 40+C, T1 \
  1440. ADCQ T10, T3 \
  1441. ADCQ T8, T6 \
  1442. ADCQ T9, T5 \
  1443. ADCQ DX, T2 \
  1444. ADCQ AX, T4
  1445. // Template for the Montgomery reduction algorithm described in
  1446. // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. The template must be
  1447. // instantiated with a schoolbook multiplication macro for a 256 x 448-bit product.
  1448. // It uses the memory at M0 as working storage and *changes* it. Smashes registers
  1449. // R[8-15], AX, BX, CX, DX, BP.
  1450. // Input:
  1451. // * M0  : 1536-bit number to be reduced (overwritten with intermediate values)
  1452. // * MULS: either mul256x448bmi2 or mul256x448bmi2adx
  1453. // Output: C receives the 768-bit result (48+C onwards is also used as scratch on the way)
  1454. #define REDC(C, M0, MULS) \
  1455. \ // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
  1456. MULS(M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
  1457. XORQ R15, R15 \ // R15 = 0, so the ADCQ into R15 below yields M0[16] plus the carry
  1458. MOVQ 48+C, AX \ // fetch the three product limbs that MULS stored in memory
  1459. MOVQ 56+C, DX \
  1460. MOVQ 64+C, BX \
  1461. ADDQ 40+M0, AX \ // add the product to M0 starting at limb 5 (byte offset 40)
  1462. ADCQ 48+M0, DX \
  1463. ADCQ 56+M0, BX \
  1464. MOVQ AX, 40+M0 \ // MOVQ leaves the flags alone, so the carry chain continues below
  1465. MOVQ DX, 48+M0 \
  1466. MOVQ BX, 56+M0 \
  1467. ADCQ 64+M0, BP \
  1468. ADCQ 72+M0, R8 \
  1469. ADCQ 80+M0, R9 \
  1470. ADCQ 88+M0, R10 \
  1471. ADCQ 96+M0, R11 \
  1472. ADCQ 104+M0, R12 \
  1473. ADCQ 112+M0, R13 \
  1474. ADCQ 120+M0, R14 \
  1475. ADCQ 128+M0, R15 \
  1476. MOVQ BP, 64+M0 \
  1477. MOVQ R8, 72+M0 \
  1478. MOVQ R9, 80+M0 \
  1479. MOVQ R10, 88+M0 \
  1480. MOVQ R11, 96+M0 \
  1481. MOVQ R12, 104+M0 \
  1482. MOVQ R13, 112+M0 \
  1483. MOVQ R14, 120+M0 \
  1484. MOVQ R15, 128+M0 \
  1485. MOVQ 136+M0, R8 \ // propagate the remaining carry through limbs 17..23 of M0
  1486. MOVQ 144+M0, R9 \
  1487. MOVQ 152+M0, R10 \
  1488. MOVQ 160+M0, R11 \
  1489. MOVQ 168+M0, R12 \
  1490. MOVQ 176+M0, R13 \
  1491. MOVQ 184+M0, R14 \
  1492. ADCQ $0, R8 \
  1493. ADCQ $0, R9 \
  1494. ADCQ $0, R10 \
  1495. ADCQ $0, R11 \
  1496. ADCQ $0, R12 \
  1497. ADCQ $0, R13 \
  1498. ADCQ $0, R14 \
  1499. MOVQ R8, 136+M0 \
  1500. MOVQ R9, 144+M0 \
  1501. MOVQ R10, 152+M0 \
  1502. MOVQ R11, 160+M0 \
  1503. MOVQ R12, 168+M0 \
  1504. MOVQ R13, 176+M0 \
  1505. MOVQ R14, 184+M0 \
  1506. \ // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
  1507. MULS(32+M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
  1508. XORQ R15, R15 \ // R15 = 0, so the ADCQ into R15 below yields M0[20] plus the carry
  1509. MOVQ 48+C, AX \
  1510. MOVQ 56+C, DX \
  1511. MOVQ 64+C, BX \
  1512. ADDQ 72+M0, AX \ // add the product to M0 starting at limb 9 (byte offset 72)
  1513. ADCQ 80+M0, DX \
  1514. ADCQ 88+M0, BX \
  1515. MOVQ AX, 72+M0 \
  1516. MOVQ DX, 80+M0 \
  1517. MOVQ BX, 88+M0 \
  1518. ADCQ 96+M0, BP \
  1519. ADCQ 104+M0, R8 \
  1520. ADCQ 112+M0, R9 \
  1521. ADCQ 120+M0, R10 \
  1522. ADCQ 128+M0, R11 \
  1523. ADCQ 136+M0, R12 \
  1524. ADCQ 144+M0, R13 \
  1525. ADCQ 152+M0, R14 \
  1526. ADCQ 160+M0, R15 \
  1527. MOVQ BP, 0+C \ // Final result c0
  1528. MOVQ R8, 104+M0 \
  1529. MOVQ R9, 112+M0 \
  1530. MOVQ R10, 120+M0 \
  1531. MOVQ R11, 128+M0 \
  1532. MOVQ R12, 136+M0 \
  1533. MOVQ R13, 144+M0 \
  1534. MOVQ R14, 152+M0 \
  1535. MOVQ R15, 160+M0 \
  1536. MOVQ 168+M0, R12 \ // propagate the remaining carry through limbs 21..23 of M0
  1537. MOVQ 176+M0, R13 \
  1538. MOVQ 184+M0, R14 \
  1539. ADCQ $0, R12 \
  1540. ADCQ $0, R13 \
  1541. ADCQ $0, R14 \
  1542. MOVQ R12, 168+M0 \
  1543. MOVQ R13, 176+M0 \
  1544. MOVQ R14, 184+M0 \
  1545. \ // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
  1546. MULS(64+M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
  1547. MOVQ 48+C, AX \ // Final result c1:c11
  1548. MOVQ 56+C, DX \
  1549. MOVQ 64+C, BX \
  1550. ADDQ 104+M0, AX \ // add the product to M0 starting at limb 13 (byte offset 104)
  1551. ADCQ 112+M0, DX \
  1552. ADCQ 120+M0, BX \
  1553. MOVQ AX, 8+C \
  1554. MOVQ DX, 16+C \
  1555. MOVQ BX, 24+C \
  1556. ADCQ 128+M0, BP \
  1557. ADCQ 136+M0, R8 \
  1558. ADCQ 144+M0, R9 \
  1559. ADCQ 152+M0, R10 \
  1560. ADCQ 160+M0, R11 \
  1561. ADCQ 168+M0, R12 \
  1562. ADCQ 176+M0, R13 \
  1563. ADCQ 184+M0, R14 \ // the carry out of this last limb is not stored anywhere
  1564. MOVQ BP, 32+C \
  1565. MOVQ R8, 40+C \
  1566. MOVQ R9, 48+C \ // the scratch limbs at 48+C..64+C were already loaded above, so they can be overwritten
  1567. MOVQ R10, 56+C \
  1568. MOVQ R11, 64+C \
  1569. MOVQ R12, 72+C \
  1570. MOVQ R13, 80+C \
  1571. MOVQ R14, 88+C
  1572. TEXT ·fp751MontgomeryReduce(SB), $0-16 // Montgomery reduction of the 24-limb value at x into the 12-limb value at z; the BMI2 paths also overwrite x
  1573. MOVQ z+0(FP), REG_P2 // REG_P2 = z (output)
  1574. MOVQ x+8(FP), REG_P1 // REG_P1 = x (input)
  1575. // Check whether to use optimized implementation
  1576. CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1
  1577. JE redc_with_mulx_adcx_adox
  1578. CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1
  1579. JE redc_with_mulx
  1580. MOVQ (REG_P1), R11 // generic path (MULQ only): R11 = x[0]; sums rotate through R8, R9, R10
  1581. MOVQ P751P1_5, AX
  1582. MULQ R11
  1583. XORQ R8, R8
  1584. ADDQ (40)(REG_P1), AX
  1585. MOVQ AX, (40)(REG_P2) // Z5 (intermediate: reloaded as a multiplier below, overwritten by the later Z5 store)
  1586. ADCQ DX, R8
  1587. XORQ R9, R9
  1588. MOVQ P751P1_6, AX
  1589. MULQ R11
  1590. XORQ R10, R10
  1591. ADDQ AX, R8
  1592. ADCQ DX, R9
  1593. MOVQ (8)(REG_P1), R12 // R12 = x[1]
  1594. MOVQ P751P1_5, AX
  1595. MULQ R12
  1596. ADDQ AX, R8
  1597. ADCQ DX, R9
  1598. ADCQ $0, R10
  1599. ADDQ (48)(REG_P1), R8
  1600. MOVQ R8, (48)(REG_P2) // Z6
  1601. ADCQ $0, R9
  1602. ADCQ $0, R10
  1603. XORQ R8, R8
  1604. MOVQ P751P1_7, AX
  1605. MULQ R11
  1606. ADDQ AX, R9
  1607. ADCQ DX, R10
  1608. ADCQ $0, R8
  1609. MOVQ P751P1_6, AX
  1610. MULQ R12
  1611. ADDQ AX, R9
  1612. ADCQ DX, R10
  1613. ADCQ $0, R8
  1614. MOVQ (16)(REG_P1), R13 // R13 = x[2]
  1615. MOVQ P751P1_5, AX
  1616. MULQ R13
  1617. ADDQ AX, R9
  1618. ADCQ DX, R10
  1619. ADCQ $0, R8
  1620. ADDQ (56)(REG_P1), R9
  1621. MOVQ R9, (56)(REG_P2) // Z7
  1622. ADCQ $0, R10
  1623. ADCQ $0, R8
  1624. XORQ R9, R9
  1625. MOVQ P751P1_8, AX
  1626. MULQ R11
  1627. ADDQ AX, R10
  1628. ADCQ DX, R8
  1629. ADCQ $0, R9
  1630. MOVQ P751P1_7, AX
  1631. MULQ R12
  1632. ADDQ AX, R10
  1633. ADCQ DX, R8
  1634. ADCQ $0, R9
  1635. MOVQ P751P1_6, AX
  1636. MULQ R13
  1637. ADDQ AX, R10
  1638. ADCQ DX, R8
  1639. ADCQ $0, R9
  1640. MOVQ (24)(REG_P1), R14 // R14 = x[3]
  1641. MOVQ P751P1_5, AX
  1642. MULQ R14
  1643. ADDQ AX, R10
  1644. ADCQ DX, R8
  1645. ADCQ $0, R9
  1646. ADDQ (64)(REG_P1), R10
  1647. MOVQ R10, (64)(REG_P2) // Z8
  1648. ADCQ $0, R8
  1649. ADCQ $0, R9
  1650. XORQ R10, R10
  1651. MOVQ P751P1_9, AX
  1652. MULQ R11
  1653. ADDQ AX, R8
  1654. ADCQ DX, R9
  1655. ADCQ $0, R10
  1656. MOVQ P751P1_8, AX
  1657. MULQ R12
  1658. ADDQ AX, R8
  1659. ADCQ DX, R9
  1660. ADCQ $0, R10
  1661. MOVQ P751P1_7, AX
  1662. MULQ R13
  1663. ADDQ AX, R8
  1664. ADCQ DX, R9
  1665. ADCQ $0, R10
  1666. MOVQ P751P1_6, AX
  1667. MULQ R14
  1668. ADDQ AX, R8
  1669. ADCQ DX, R9
  1670. ADCQ $0, R10
  1671. MOVQ (32)(REG_P1), R15 // R15 = x[4]
  1672. MOVQ P751P1_5, AX
  1673. MULQ R15
  1674. ADDQ AX, R8
  1675. ADCQ DX, R9
  1676. ADCQ $0, R10
  1677. ADDQ (72)(REG_P1), R8
  1678. MOVQ R8, (72)(REG_P2) // Z9
  1679. ADCQ $0, R9
  1680. ADCQ $0, R10
  1681. XORQ R8, R8
  1682. MOVQ P751P1_10, AX
  1683. MULQ R11
  1684. ADDQ AX, R9
  1685. ADCQ DX, R10
  1686. ADCQ $0, R8
  1687. MOVQ P751P1_9, AX
  1688. MULQ R12
  1689. ADDQ AX, R9
  1690. ADCQ DX, R10
  1691. ADCQ $0, R8
  1692. MOVQ P751P1_8, AX
  1693. MULQ R13
  1694. ADDQ AX, R9
  1695. ADCQ DX, R10
  1696. ADCQ $0, R8
  1697. MOVQ P751P1_7, AX
  1698. MULQ R14
  1699. ADDQ AX, R9
  1700. ADCQ DX, R10
  1701. ADCQ $0, R8
  1702. MOVQ P751P1_6, AX
  1703. MULQ R15
  1704. ADDQ AX, R9
  1705. ADCQ DX, R10
  1706. ADCQ $0, R8
  1707. MOVQ (40)(REG_P2), CX // CX = the Z5 value stored above
  1708. MOVQ P751P1_5, AX
  1709. MULQ CX
  1710. ADDQ AX, R9
  1711. ADCQ DX, R10
  1712. ADCQ $0, R8
  1713. ADDQ (80)(REG_P1), R9
  1714. MOVQ R9, (80)(REG_P2) // Z10
  1715. ADCQ $0, R10
  1716. ADCQ $0, R8
  1717. XORQ R9, R9
  1718. MOVQ P751P1_11, AX
  1719. MULQ R11
  1720. ADDQ AX, R10
  1721. ADCQ DX, R8
  1722. ADCQ $0, R9
  1723. MOVQ P751P1_10, AX
  1724. MULQ R12
  1725. ADDQ AX, R10
  1726. ADCQ DX, R8
  1727. ADCQ $0, R9
  1728. MOVQ P751P1_9, AX
  1729. MULQ R13
  1730. ADDQ AX, R10
  1731. ADCQ DX, R8
  1732. ADCQ $0, R9
  1733. MOVQ P751P1_8, AX
  1734. MULQ R14
  1735. ADDQ AX, R10
  1736. ADCQ DX, R8
  1737. ADCQ $0, R9
  1738. MOVQ P751P1_7, AX
  1739. MULQ R15
  1740. ADDQ AX, R10
  1741. ADCQ DX, R8
  1742. ADCQ $0, R9
  1743. MOVQ P751P1_6, AX
  1744. MULQ CX
  1745. ADDQ AX, R10
  1746. ADCQ DX, R8
  1747. ADCQ $0, R9
  1748. MOVQ (48)(REG_P2), R11 // R11 = the Z6 value stored above (x[0] is no longer needed)
  1749. MOVQ P751P1_5, AX
  1750. MULQ R11
  1751. ADDQ AX, R10
  1752. ADCQ DX, R8
  1753. ADCQ $0, R9
  1754. ADDQ (88)(REG_P1), R10
  1755. MOVQ R10, (88)(REG_P2) // Z11
  1756. ADCQ $0, R8
  1757. ADCQ $0, R9
  1758. XORQ R10, R10
  1759. MOVQ P751P1_11, AX
  1760. MULQ R12
  1761. ADDQ AX, R8
  1762. ADCQ DX, R9
  1763. ADCQ $0, R10
  1764. MOVQ P751P1_10, AX
  1765. MULQ R13
  1766. ADDQ AX, R8
  1767. ADCQ DX, R9
  1768. ADCQ $0, R10
  1769. MOVQ P751P1_9, AX
  1770. MULQ R14
  1771. ADDQ AX, R8
  1772. ADCQ DX, R9
  1773. ADCQ $0, R10
  1774. MOVQ P751P1_8, AX
  1775. MULQ R15
  1776. ADDQ AX, R8
  1777. ADCQ DX, R9
  1778. ADCQ $0, R10
  1779. MOVQ P751P1_7, AX
  1780. MULQ CX
  1781. ADDQ AX, R8
  1782. ADCQ DX, R9
  1783. ADCQ $0, R10
  1784. MOVQ P751P1_6, AX
  1785. MULQ R11
  1786. ADDQ AX, R8
  1787. ADCQ DX, R9
  1788. ADCQ $0, R10
  1789. MOVQ (56)(REG_P2), R12 // R12 = the Z7 value stored above
  1790. MOVQ P751P1_5, AX
  1791. MULQ R12
  1792. ADDQ AX, R8
  1793. ADCQ DX, R9
  1794. ADCQ $0, R10
  1795. ADDQ (96)(REG_P1), R8
  1796. MOVQ R8, (REG_P2) // Z0
  1797. ADCQ $0, R9
  1798. ADCQ $0, R10
  1799. XORQ R8, R8
  1800. MOVQ P751P1_11, AX
  1801. MULQ R13
  1802. ADDQ AX, R9
  1803. ADCQ DX, R10
  1804. ADCQ $0, R8
  1805. MOVQ P751P1_10, AX
  1806. MULQ R14
  1807. ADDQ AX, R9
  1808. ADCQ DX, R10
  1809. ADCQ $0, R8
  1810. MOVQ P751P1_9, AX
  1811. MULQ R15
  1812. ADDQ AX, R9
  1813. ADCQ DX, R10
  1814. ADCQ $0, R8
  1815. MOVQ P751P1_8, AX
  1816. MULQ CX
  1817. ADDQ AX, R9
  1818. ADCQ DX, R10
  1819. ADCQ $0, R8
  1820. MOVQ P751P1_7, AX
  1821. MULQ R11
  1822. ADDQ AX, R9
  1823. ADCQ DX, R10
  1824. ADCQ $0, R8
  1825. MOVQ P751P1_6, AX
  1826. MULQ R12
  1827. ADDQ AX, R9
  1828. ADCQ DX, R10
  1829. ADCQ $0, R8
  1830. MOVQ (64)(REG_P2), R13 // R13 = the Z8 value stored above
  1831. MOVQ P751P1_5, AX
  1832. MULQ R13
  1833. ADDQ AX, R9
  1834. ADCQ DX, R10
  1835. ADCQ $0, R8
  1836. ADDQ (104)(REG_P1), R9
  1837. MOVQ R9, (8)(REG_P2) // Z1
  1838. ADCQ $0, R10
  1839. ADCQ $0, R8
  1840. XORQ R9, R9
  1841. MOVQ P751P1_11, AX
  1842. MULQ R14
  1843. ADDQ AX, R10
  1844. ADCQ DX, R8
  1845. ADCQ $0, R9
  1846. MOVQ P751P1_10, AX
  1847. MULQ R15
  1848. ADDQ AX, R10
  1849. ADCQ DX, R8
  1850. ADCQ $0, R9
  1851. MOVQ P751P1_9, AX
  1852. MULQ CX
  1853. ADDQ AX, R10
  1854. ADCQ DX, R8
  1855. ADCQ $0, R9
  1856. MOVQ P751P1_8, AX
  1857. MULQ R11
  1858. ADDQ AX, R10
  1859. ADCQ DX, R8
  1860. ADCQ $0, R9
  1861. MOVQ P751P1_7, AX
  1862. MULQ R12
  1863. ADDQ AX, R10
  1864. ADCQ DX, R8
  1865. ADCQ $0, R9
  1866. MOVQ P751P1_6, AX
  1867. MULQ R13
  1868. ADDQ AX, R10
  1869. ADCQ DX, R8
  1870. ADCQ $0, R9
  1871. MOVQ (72)(REG_P2), R14 // R14 = the Z9 value stored above
  1872. MOVQ P751P1_5, AX
  1873. MULQ R14
  1874. ADDQ AX, R10
  1875. ADCQ DX, R8
  1876. ADCQ $0, R9
  1877. ADDQ (112)(REG_P1), R10
  1878. MOVQ R10, (16)(REG_P2) // Z2
  1879. ADCQ $0, R8
  1880. ADCQ $0, R9
  1881. XORQ R10, R10
  1882. MOVQ P751P1_11, AX
  1883. MULQ R15
  1884. ADDQ AX, R8
  1885. ADCQ DX, R9
  1886. ADCQ $0, R10
  1887. MOVQ P751P1_10, AX
  1888. MULQ CX
  1889. ADDQ AX, R8
  1890. ADCQ DX, R9
  1891. ADCQ $0, R10
  1892. MOVQ P751P1_9, AX
  1893. MULQ R11
  1894. ADDQ AX, R8
  1895. ADCQ DX, R9
  1896. ADCQ $0, R10
  1897. MOVQ P751P1_8, AX
  1898. MULQ R12
  1899. ADDQ AX, R8
  1900. ADCQ DX, R9
  1901. ADCQ $0, R10
  1902. MOVQ P751P1_7, AX
  1903. MULQ R13
  1904. ADDQ AX, R8
  1905. ADCQ DX, R9
  1906. ADCQ $0, R10
  1907. MOVQ P751P1_6, AX
  1908. MULQ R14
  1909. ADDQ AX, R8
  1910. ADCQ DX, R9
  1911. ADCQ $0, R10
  1912. MOVQ (80)(REG_P2), R15 // R15 = the Z10 value stored above
  1913. MOVQ P751P1_5, AX
  1914. MULQ R15
  1915. ADDQ AX, R8
  1916. ADCQ DX, R9
  1917. ADCQ $0, R10
  1918. ADDQ (120)(REG_P1), R8
  1919. MOVQ R8, (24)(REG_P2) // Z3
  1920. ADCQ $0, R9
  1921. ADCQ $0, R10
  1922. XORQ R8, R8
  1923. MOVQ P751P1_11, AX
  1924. MULQ CX
  1925. ADDQ AX, R9
  1926. ADCQ DX, R10
  1927. ADCQ $0, R8
  1928. MOVQ P751P1_10, AX
  1929. MULQ R11
  1930. ADDQ AX, R9
  1931. ADCQ DX, R10
  1932. ADCQ $0, R8
  1933. MOVQ P751P1_9, AX
  1934. MULQ R12
  1935. ADDQ AX, R9
  1936. ADCQ DX, R10
  1937. ADCQ $0, R8
  1938. MOVQ P751P1_8, AX
  1939. MULQ R13
  1940. ADDQ AX, R9
  1941. ADCQ DX, R10
  1942. ADCQ $0, R8
  1943. MOVQ P751P1_7, AX
  1944. MULQ R14
  1945. ADDQ AX, R9
  1946. ADCQ DX, R10
  1947. ADCQ $0, R8
  1948. MOVQ P751P1_6, AX
  1949. MULQ R15
  1950. ADDQ AX, R9
  1951. ADCQ DX, R10
  1952. ADCQ $0, R8
  1953. MOVQ (88)(REG_P2), CX // CX = the Z11 value stored above
  1954. MOVQ P751P1_5, AX
  1955. MULQ CX
  1956. ADDQ AX, R9
  1957. ADCQ DX, R10
  1958. ADCQ $0, R8
  1959. ADDQ (128)(REG_P1), R9
  1960. MOVQ R9, (32)(REG_P2) // Z4
  1961. ADCQ $0, R10
  1962. ADCQ $0, R8
  1963. XORQ R9, R9
  1964. MOVQ P751P1_11, AX
  1965. MULQ R11
  1966. ADDQ AX, R10
  1967. ADCQ DX, R8
  1968. ADCQ $0, R9
  1969. MOVQ P751P1_10, AX
  1970. MULQ R12
  1971. ADDQ AX, R10
  1972. ADCQ DX, R8
  1973. ADCQ $0, R9
  1974. MOVQ P751P1_9, AX
  1975. MULQ R13
  1976. ADDQ AX, R10
  1977. ADCQ DX, R8
  1978. ADCQ $0, R9
  1979. MOVQ P751P1_8, AX
  1980. MULQ R14
  1981. ADDQ AX, R10
  1982. ADCQ DX, R8
  1983. ADCQ $0, R9
  1984. MOVQ P751P1_7, AX
  1985. MULQ R15
  1986. ADDQ AX, R10
  1987. ADCQ DX, R8
  1988. ADCQ $0, R9
  1989. MOVQ P751P1_6, AX
  1990. MULQ CX
  1991. ADDQ AX, R10
  1992. ADCQ DX, R8
  1993. ADCQ $0, R9
  1994. ADDQ (136)(REG_P1), R10
  1995. MOVQ R10, (40)(REG_P2) // Z5 (replaces the intermediate value stored at the start)
  1996. ADCQ $0, R8
  1997. ADCQ $0, R9
  1998. XORQ R10, R10
  1999. MOVQ P751P1_11, AX
  2000. MULQ R12
  2001. ADDQ AX, R8
  2002. ADCQ DX, R9
  2003. ADCQ $0, R10
  2004. MOVQ P751P1_10, AX
  2005. MULQ R13
  2006. ADDQ AX, R8
  2007. ADCQ DX, R9
  2008. ADCQ $0, R10
  2009. MOVQ P751P1_9, AX
  2010. MULQ R14
  2011. ADDQ AX, R8
  2012. ADCQ DX, R9
  2013. ADCQ $0, R10
  2014. MOVQ P751P1_8, AX
  2015. MULQ R15
  2016. ADDQ AX, R8
  2017. ADCQ DX, R9
  2018. ADCQ $0, R10
  2019. MOVQ P751P1_7, AX
  2020. MULQ CX
  2021. ADDQ AX, R8
  2022. ADCQ DX, R9
  2023. ADCQ $0, R10
  2024. ADDQ (144)(REG_P1), R8
  2025. MOVQ R8, (48)(REG_P2) // Z6
  2026. ADCQ $0, R9
  2027. ADCQ $0, R10
  2028. XORQ R8, R8
  2029. MOVQ P751P1_11, AX
  2030. MULQ R13
  2031. ADDQ AX, R9
  2032. ADCQ DX, R10
  2033. ADCQ $0, R8
  2034. MOVQ P751P1_10, AX
  2035. MULQ R14
  2036. ADDQ AX, R9
  2037. ADCQ DX, R10
  2038. ADCQ $0, R8
  2039. MOVQ P751P1_9, AX
  2040. MULQ R15
  2041. ADDQ AX, R9
  2042. ADCQ DX, R10
  2043. ADCQ $0, R8
  2044. MOVQ P751P1_8, AX
  2045. MULQ CX
  2046. ADDQ AX, R9
  2047. ADCQ DX, R10
  2048. ADCQ $0, R8
  2049. ADDQ (152)(REG_P1), R9
  2050. MOVQ R9, (56)(REG_P2) // Z7
  2051. ADCQ $0, R10
  2052. ADCQ $0, R8
  2053. XORQ R9, R9
  2054. MOVQ P751P1_11, AX
  2055. MULQ R14
  2056. ADDQ AX, R10
  2057. ADCQ DX, R8
  2058. ADCQ $0, R9
  2059. MOVQ P751P1_10, AX
  2060. MULQ R15
  2061. ADDQ AX, R10
  2062. ADCQ DX, R8
  2063. ADCQ $0, R9
  2064. MOVQ P751P1_9, AX
  2065. MULQ CX
  2066. ADDQ AX, R10
  2067. ADCQ DX, R8
  2068. ADCQ $0, R9
  2069. ADDQ (160)(REG_P1), R10
  2070. MOVQ R10, (64)(REG_P2) // Z8
  2071. ADCQ $0, R8
  2072. ADCQ $0, R9
  2073. XORQ R10, R10
  2074. MOVQ P751P1_11, AX
  2075. MULQ R15
  2076. ADDQ AX, R8
  2077. ADCQ DX, R9
  2078. ADCQ $0, R10
  2079. MOVQ P751P1_10, AX
  2080. MULQ CX
  2081. ADDQ AX, R8
  2082. ADCQ DX, R9
  2083. ADCQ $0, R10
  2084. ADDQ (168)(REG_P1), R8 // Z9
  2085. MOVQ R8, (72)(REG_P2) // Z9
  2086. ADCQ $0, R9
  2087. ADCQ $0, R10
  2088. MOVQ P751P1_11, AX
  2089. MULQ CX
  2090. ADDQ AX, R9
  2091. ADCQ DX, R10
  2092. ADDQ (176)(REG_P1), R9 // Z10
  2093. MOVQ R9, (80)(REG_P2) // Z10
  2094. ADCQ $0, R10
  2095. ADDQ (184)(REG_P1), R10 // Z11
  2096. MOVQ R10, (88)(REG_P2) // Z11
  2097. RET
  2098. redc_with_mulx_adcx_adox:
  2099. // This implements the Montgomery reduction algorithm described in
  2100. // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
  2101. // This assumes that the BMI2 and ADX instruction set extensions are available. NOTE(review): REDC overwrites BP while this TEXT declares a $0 frame; confirm BP does not need to be preserved here.
  2102. REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2adx)
  2103. RET
  2104. redc_with_mulx:
  2105. // This implements the Montgomery reduction algorithm described in
  2106. // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
  2107. // This assumes that the BMI2 instruction set extension is available. NOTE(review): REDC overwrites BP while this TEXT declares a $0 frame; confirm BP does not need to be preserved here.
  2108. REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2)
  2109. RET
  2110. TEXT ·fp751AddLazy(SB), NOSPLIT, $0-24 // z = x + y over 12 limbs (768 bits); no reduction is performed and the final carry is dropped
  2111. MOVQ z+0(FP), REG_P3 // REG_P3 = z (output)
  2112. MOVQ x+8(FP), REG_P1 // REG_P1 = x
  2113. MOVQ y+16(FP), REG_P2 // REG_P2 = y
  2114. MOVQ (REG_P1), R8 // load all 12 limbs of x into registers
  2115. MOVQ (8)(REG_P1), R9
  2116. MOVQ (16)(REG_P1), R10
  2117. MOVQ (24)(REG_P1), R11
  2118. MOVQ (32)(REG_P1), R12
  2119. MOVQ (40)(REG_P1), R13
  2120. MOVQ (48)(REG_P1), R14
  2121. MOVQ (56)(REG_P1), R15
  2122. MOVQ (64)(REG_P1), AX
  2123. MOVQ (72)(REG_P1), BX
  2124. MOVQ (80)(REG_P1), CX
  2125. MOVQ (88)(REG_P1), DI // REG_P1 is DI: this is the last read through it, so the pointer can be overwritten
  2126. ADDQ (REG_P2), R8 // single carry chain across the 12 limbs of y
  2127. ADCQ (8)(REG_P2), R9
  2128. ADCQ (16)(REG_P2), R10
  2129. ADCQ (24)(REG_P2), R11
  2130. ADCQ (32)(REG_P2), R12
  2131. ADCQ (40)(REG_P2), R13
  2132. ADCQ (48)(REG_P2), R14
  2133. ADCQ (56)(REG_P2), R15
  2134. ADCQ (64)(REG_P2), AX
  2135. ADCQ (72)(REG_P2), BX
  2136. ADCQ (80)(REG_P2), CX
  2137. ADCQ (88)(REG_P2), DI
  2138. MOVQ R8, (REG_P3) // store the 12 result limbs to z
  2139. MOVQ R9, (8)(REG_P3)
  2140. MOVQ R10, (16)(REG_P3)
  2141. MOVQ R11, (24)(REG_P3)
  2142. MOVQ R12, (32)(REG_P3)
  2143. MOVQ R13, (40)(REG_P3)
  2144. MOVQ R14, (48)(REG_P3)
  2145. MOVQ R15, (56)(REG_P3)
  2146. MOVQ AX, (64)(REG_P3)
  2147. MOVQ BX, (72)(REG_P3)
  2148. MOVQ CX, (80)(REG_P3)
  2149. MOVQ DI, (88)(REG_P3)
  2150. RET
  2151. TEXT ·fp751X2AddLazy(SB), NOSPLIT, $0-24 // z = x + y over 24 limbs (1536 bits); no reduction is performed and the final carry is dropped
  2152. MOVQ z+0(FP), REG_P3 // REG_P3 = z (output)
  2153. MOVQ x+8(FP), REG_P1 // REG_P1 = x
  2154. MOVQ y+16(FP), REG_P2 // REG_P2 = y
  2155. MOVQ (REG_P1), R8 // limbs 0..10 of x go through registers
  2156. MOVQ (8)(REG_P1), R9
  2157. MOVQ (16)(REG_P1), R10
  2158. MOVQ (24)(REG_P1), R11
  2159. MOVQ (32)(REG_P1), R12
  2160. MOVQ (40)(REG_P1), R13
  2161. MOVQ (48)(REG_P1), R14
  2162. MOVQ (56)(REG_P1), R15
  2163. MOVQ (64)(REG_P1), AX
  2164. MOVQ (72)(REG_P1), BX
  2165. MOVQ (80)(REG_P1), CX
  2166. ADDQ (REG_P2), R8 // start of the carry chain that runs through all 24 limbs
  2167. ADCQ (8)(REG_P2), R9
  2168. ADCQ (16)(REG_P2), R10
  2169. ADCQ (24)(REG_P2), R11
  2170. ADCQ (32)(REG_P2), R12
  2171. ADCQ (40)(REG_P2), R13
  2172. ADCQ (48)(REG_P2), R14
  2173. ADCQ (56)(REG_P2), R15
  2174. ADCQ (64)(REG_P2), AX
  2175. ADCQ (72)(REG_P2), BX
  2176. ADCQ (80)(REG_P2), CX
  2177. MOVQ R8, (REG_P3) // MOVQ does not touch the flags, so the carry survives these stores
  2178. MOVQ R9, (8)(REG_P3)
  2179. MOVQ R10, (16)(REG_P3)
  2180. MOVQ R11, (24)(REG_P3)
  2181. MOVQ R12, (32)(REG_P3)
  2182. MOVQ R13, (40)(REG_P3)
  2183. MOVQ R14, (48)(REG_P3)
  2184. MOVQ R15, (56)(REG_P3)
  2185. MOVQ AX, (64)(REG_P3)
  2186. MOVQ BX, (72)(REG_P3)
  2187. MOVQ CX, (80)(REG_P3)
  2188. MOVQ (88)(REG_P1), AX // limb 11 is handled on its own through AX, continuing the same carry
  2189. ADCQ (88)(REG_P2), AX
  2190. MOVQ AX, (88)(REG_P3)
  2191. MOVQ (96)(REG_P1), R8 // limbs 12..23 of x
  2192. MOVQ (104)(REG_P1), R9
  2193. MOVQ (112)(REG_P1), R10
  2194. MOVQ (120)(REG_P1), R11
  2195. MOVQ (128)(REG_P1), R12
  2196. MOVQ (136)(REG_P1), R13
  2197. MOVQ (144)(REG_P1), R14
  2198. MOVQ (152)(REG_P1), R15
  2199. MOVQ (160)(REG_P1), AX
  2200. MOVQ (168)(REG_P1), BX
  2201. MOVQ (176)(REG_P1), CX
  2202. MOVQ (184)(REG_P1), DI // REG_P1 is DI: this is the last read through it, so the pointer can be overwritten
  2203. ADCQ (96)(REG_P2), R8
  2204. ADCQ (104)(REG_P2), R9
  2205. ADCQ (112)(REG_P2), R10
  2206. ADCQ (120)(REG_P2), R11
  2207. ADCQ (128)(REG_P2), R12
  2208. ADCQ (136)(REG_P2), R13
  2209. ADCQ (144)(REG_P2), R14
  2210. ADCQ (152)(REG_P2), R15
  2211. ADCQ (160)(REG_P2), AX
  2212. ADCQ (168)(REG_P2), BX
  2213. ADCQ (176)(REG_P2), CX
  2214. ADCQ (184)(REG_P2), DI
  2215. MOVQ R8, (96)(REG_P3)
  2216. MOVQ R9, (104)(REG_P3)
  2217. MOVQ R10, (112)(REG_P3)
  2218. MOVQ R11, (120)(REG_P3)
  2219. MOVQ R12, (128)(REG_P3)
  2220. MOVQ R13, (136)(REG_P3)
  2221. MOVQ R14, (144)(REG_P3)
  2222. MOVQ R15, (152)(REG_P3)
  2223. MOVQ AX, (160)(REG_P3)
  2224. MOVQ BX, (168)(REG_P3)
  2225. MOVQ CX, (176)(REG_P3)
  2226. MOVQ DI, (184)(REG_P3)
  2227. RET
  2228. TEXT ·fp751X2SubLazy(SB), NOSPLIT, $0-24 // z = x - y over 24 limbs (1536 bits); on borrow, p is added back to the upper 12 limbs
  2229. MOVQ z+0(FP), REG_P3 // REG_P3 = z (output)
  2230. MOVQ x+8(FP), REG_P1 // REG_P1 = x
  2231. MOVQ y+16(FP), REG_P2 // REG_P2 = y
  2232. MOVQ (REG_P1), R8 // limbs 0..10 of x go through registers
  2233. MOVQ (8)(REG_P1), R9
  2234. MOVQ (16)(REG_P1), R10
  2235. MOVQ (24)(REG_P1), R11
  2236. MOVQ (32)(REG_P1), R12
  2237. MOVQ (40)(REG_P1), R13
  2238. MOVQ (48)(REG_P1), R14
  2239. MOVQ (56)(REG_P1), R15
  2240. MOVQ (64)(REG_P1), AX
  2241. MOVQ (72)(REG_P1), BX
  2242. MOVQ (80)(REG_P1), CX
  2243. SUBQ (REG_P2), R8 // start of the borrow chain that runs through all 24 limbs
  2244. SBBQ (8)(REG_P2), R9
  2245. SBBQ (16)(REG_P2), R10
  2246. SBBQ (24)(REG_P2), R11
  2247. SBBQ (32)(REG_P2), R12
  2248. SBBQ (40)(REG_P2), R13
  2249. SBBQ (48)(REG_P2), R14
  2250. SBBQ (56)(REG_P2), R15
  2251. SBBQ (64)(REG_P2), AX
  2252. SBBQ (72)(REG_P2), BX
  2253. SBBQ (80)(REG_P2), CX
  2254. MOVQ R8, (REG_P3) // MOVQ does not touch the flags, so the borrow survives these stores
  2255. MOVQ R9, (8)(REG_P3)
  2256. MOVQ R10, (16)(REG_P3)
  2257. MOVQ R11, (24)(REG_P3)
  2258. MOVQ R12, (32)(REG_P3)
  2259. MOVQ R13, (40)(REG_P3)
  2260. MOVQ R14, (48)(REG_P3)
  2261. MOVQ R15, (56)(REG_P3)
  2262. MOVQ AX, (64)(REG_P3)
  2263. MOVQ BX, (72)(REG_P3)
  2264. MOVQ CX, (80)(REG_P3)
  2265. MOVQ (88)(REG_P1), AX // limb 11 is handled on its own through AX, continuing the same borrow
  2266. SBBQ (88)(REG_P2), AX
  2267. MOVQ AX, (88)(REG_P3)
  2268. MOVQ (96)(REG_P1), R8 // limbs 12..23 of x
  2269. MOVQ (104)(REG_P1), R9
  2270. MOVQ (112)(REG_P1), R10
  2271. MOVQ (120)(REG_P1), R11
  2272. MOVQ (128)(REG_P1), R12
  2273. MOVQ (136)(REG_P1), R13
  2274. MOVQ (144)(REG_P1), R14
  2275. MOVQ (152)(REG_P1), R15
  2276. MOVQ (160)(REG_P1), AX
  2277. MOVQ (168)(REG_P1), BX
  2278. MOVQ (176)(REG_P1), CX
  2279. MOVQ (184)(REG_P1), DI // REG_P1 is DI: this is the last read through it, so the pointer can be overwritten
  2280. SBBQ (96)(REG_P2), R8
  2281. SBBQ (104)(REG_P2), R9
  2282. SBBQ (112)(REG_P2), R10
  2283. SBBQ (120)(REG_P2), R11
  2284. SBBQ (128)(REG_P2), R12
  2285. SBBQ (136)(REG_P2), R13
  2286. SBBQ (144)(REG_P2), R14
  2287. SBBQ (152)(REG_P2), R15
  2288. SBBQ (160)(REG_P2), AX
  2289. SBBQ (168)(REG_P2), BX
  2290. SBBQ (176)(REG_P2), CX
  2291. SBBQ (184)(REG_P2), DI
  2292. MOVQ R8, (96)(REG_P3)
  2293. MOVQ R9, (104)(REG_P3)
  2294. MOVQ R10, (112)(REG_P3)
  2295. MOVQ R11, (120)(REG_P3)
  2296. MOVQ R12, (128)(REG_P3)
  2297. MOVQ R13, (136)(REG_P3)
  2298. MOVQ R14, (144)(REG_P3)
  2299. MOVQ R15, (152)(REG_P3)
  2300. MOVQ AX, (160)(REG_P3)
  2301. MOVQ BX, (168)(REG_P3)
  2302. MOVQ CX, (176)(REG_P3)
  2303. MOVQ DI, (184)(REG_P3)
  2304. // Now the carry flag is 1 if x-y < 0. If so, add p*2^768.
  2305. MOVQ $0, AX // MOVQ rather than XORQ, so the borrow left in CF is not cleared
  2306. SBBQ $0, AX // AX = all ones if the subtraction borrowed, zero otherwise
  2307. // Load p into registers:
  2308. MOVQ P751_0, R8
  2309. // P751_{1,2,3,4} = P751_0, so reuse R8
  2310. MOVQ P751_5, R9
  2311. MOVQ P751_6, R10
  2312. MOVQ P751_7, R11
  2313. MOVQ P751_8, R12
  2314. MOVQ P751_9, R13
  2315. MOVQ P751_10, R14
  2316. MOVQ P751_11, R15
  2317. ANDQ AX, R8 // mask p with AX: the addend becomes p or 0 without branching
  2318. ANDQ AX, R9
  2319. ANDQ AX, R10
  2320. ANDQ AX, R11
  2321. ANDQ AX, R12
  2322. ANDQ AX, R13
  2323. ANDQ AX, R14
  2324. ANDQ AX, R15
  2325. ADDQ R8, (96 )(REG_P3) // add the masked p to limbs 12..23 of z, in memory
  2326. ADCQ R8, (96+ 8)(REG_P3)
  2327. ADCQ R8, (96+16)(REG_P3)
  2328. ADCQ R8, (96+24)(REG_P3)
  2329. ADCQ R8, (96+32)(REG_P3)
  2330. ADCQ R9, (96+40)(REG_P3)
  2331. ADCQ R10, (96+48)(REG_P3)
  2332. ADCQ R11, (96+56)(REG_P3)
  2333. ADCQ R12, (96+64)(REG_P3)
  2334. ADCQ R13, (96+72)(REG_P3)
  2335. ADCQ R14, (96+80)(REG_P3)
  2336. ADCQ R15, (96+88)(REG_P3)
  2337. RET