You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

1724 lines
39 KiB

// +build amd64,!noasm

#include "textflag.h"

// p503: the field prime, stored as 8 little-endian 64-bit limbs.
#define P503_0 $0xFFFFFFFFFFFFFFFF
#define P503_1 $0xFFFFFFFFFFFFFFFF
#define P503_2 $0xFFFFFFFFFFFFFFFF
#define P503_3 $0xABFFFFFFFFFFFFFF
#define P503_4 $0x13085BDA2211E7A0
#define P503_5 $0x1B9BF6C87B7E7DAF
#define P503_6 $0x6045C6BDDA77A4D0
#define P503_7 $0x004066F541811E1E
// p503+1: limbs 0-2 are zero (p503 limbs 0-2 are all-ones) and are omitted.
#define P503P1_3 $0xAC00000000000000
#define P503P1_4 $0x13085BDA2211E7A0
#define P503P1_5 $0x1B9BF6C87B7E7DAF
#define P503P1_6 $0x6045C6BDDA77A4D0
#define P503P1_7 $0x004066F541811E1E
// p503x2 = 2*p503. Note P503X2_1 == P503X2_2 (both all-ones); code below
// reuses a single register for those two limbs.
#define P503X2_0 $0xFFFFFFFFFFFFFFFE
#define P503X2_1 $0xFFFFFFFFFFFFFFFF
#define P503X2_2 $0xFFFFFFFFFFFFFFFF
#define P503X2_3 $0x57FFFFFFFFFFFFFF
#define P503X2_4 $0x2610B7B44423CF41
#define P503X2_5 $0x3737ED90F6FCFB5E
#define P503X2_6 $0xC08B8D7BB4EF49A0
#define P503X2_7 $0x0080CDEA83023C3C
// Register aliases for the pointer arguments of the fp503 functions.
#define REG_P1 DI
#define REG_P2 SI
#define REG_P3 DX
// Performs schoolbook multiplication of 2 256-bit numbers. This optimized version
// uses the MULX instruction. Macro smashes value in DX.
// Input: I0 and I1 (memory operands, 4 limbs each).
// Output: O (memory operand, 8 limbs).
// All the other arguments are registers, used for storing temporary values.
// Product notation in comments is high:low, i.e. "T0:T1 = U0*V0" means
// T0 = high limb, T1 = low limb.
#define MULS256_MULX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
	MOVQ  I0, DX           \ // MULX takes the multiplier implicitly from DX
	MULXQ I1, T1, T0       \ // T0:T1 = U0*V0
	MOVQ  T1, O            \ // O[0]
	MULXQ 8+I1, T2, T1     \ // T1:T2 = U0*V1
	ADDQ  T2, T0           \
	MULXQ 16+I1, T3, T2    \ // T2:T3 = U0*V2
	ADCQ  T3, T1           \
	MULXQ 24+I1, T4, T3    \ // T3:T4 = U0*V3
	ADCQ  T4, T2           \
	\ // Column U1
	MOVQ  8+I0, DX         \
	ADCQ  $0, T3           \
	MULXQ 0+I1, T4, T5     \ // T5:T4 = U1*V0
	MULXQ 8+I1, T7, T6     \ // T6:T7 = U1*V1
	ADDQ  T7, T5           \
	MULXQ 16+I1, T8, T7    \ // T7:T8 = U1*V2
	ADCQ  T8, T6           \
	MULXQ 24+I1, T9, T8    \ // T8:T9 = U1*V3
	ADCQ  T9, T7           \
	ADCQ  $0, T8           \
	ADDQ  T0, T4           \
	MOVQ  T4, 8+O          \ // O[1]
	ADCQ  T1, T5           \
	ADCQ  T2, T6           \
	ADCQ  T3, T7           \
	\ // Column U2
	MOVQ  16+I0, DX        \
	ADCQ  $0, T8           \
	MULXQ 0+I1, T0, T1     \ // T1:T0 = U2*V0
	MULXQ 8+I1, T3, T2     \ // T2:T3 = U2*V1
	ADDQ  T3, T1           \
	MULXQ 16+I1, T4, T3    \ // T3:T4 = U2*V2
	ADCQ  T4, T2           \
	MULXQ 24+I1, T9, T4    \ // T4:T9 = U2*V3
	ADCQ  T9, T3           \
	\ // Column U3
	MOVQ  24+I0, DX        \
	ADCQ  $0, T4           \
	ADDQ  T5, T0           \
	MOVQ  T0, 16+O         \ // O[2]
	ADCQ  T6, T1           \
	ADCQ  T7, T2           \
	ADCQ  T8, T3           \
	ADCQ  $0, T4           \
	MULXQ 0+I1, T0, T5     \ // T5:T0 = U3*V0
	MULXQ 8+I1, T7, T6     \ // T6:T7 = U3*V1
	ADDQ  T7, T5           \
	MULXQ 16+I1, T8, T7    \ // T7:T8 = U3*V2
	ADCQ  T8, T6           \
	MULXQ 24+I1, T9, T8    \ // T8:T9 = U3*V3
	ADCQ  T9, T7           \
	ADCQ  $0, T8           \
	\ // Add values in remaining columns
	ADDQ  T0, T1           \
	MOVQ  T1, 24+O         \ // O[3]
	ADCQ  T5, T2           \
	MOVQ  T2, 32+O         \ // O[4]
	ADCQ  T6, T3           \
	MOVQ  T3, 40+O         \ // O[5]
	ADCQ  T7, T4           \
	MOVQ  T4, 48+O         \ // O[6]
	ADCQ  $0, T8           \ // O[7]
	MOVQ  T8, 56+O
// Performs schoolbook multiplication of 2 256-bit numbers. This optimized version
// uses ADOX, ADCX and MULX instructions, interleaving two independent carry
// chains (CF for ADCX, OF for ADOX). Macro smashes values in AX and DX.
// Input: I0 and I1 (memory operands, 4 limbs each).
// Output: O (memory operand, 8 limbs).
// All the other argument registers are used for storing temporary values.
// Product notation in comments is high:low.
#define MULS256_MULXADX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
	\ // Column U0
	MOVQ  0+I0, DX         \ // MULX requires the multiplier in DX
	MULXQ I1, T1, T0       \ // T0:T1 = U0*V0
	MOVQ  T1, O            \ // O[0]
	MULXQ 8+I1, T2, T1     \ // T1:T2 = U0*V1
	XORQ  AX, AX           \ // clears both CF and OF; AX also serves as a zero source
	ADOXQ T2, T0           \
	MULXQ 16+I1, T3, T2    \ // T2:T3 = U0*V2
	ADOXQ T3, T1           \
	MULXQ 24+I1, T4, T3    \ // T3:T4 = U0*V3
	ADOXQ T4, T2           \
	\ // Column U1
	MOVQ  8+I0, DX         \
	MULXQ I1, T4, T5       \ // T5:T4 = U1*V0
	ADOXQ AX, T3           \
	XORQ  AX, AX           \
	MULXQ 8+I1, T7, T6     \ // T6:T7 = U1*V1
	ADOXQ T0, T4           \
	MOVQ  T4, 8+O          \ // O[1]
	ADCXQ T7, T5           \
	MULXQ 16+I1, T8, T7    \ // T7:T8 = U1*V2
	ADCXQ T8, T6           \
	ADOXQ T1, T5           \
	MULXQ 24+I1, T9, T8    \ // T8:T9 = U1*V3
	ADCXQ T9, T7           \
	ADCXQ AX, T8           \
	ADOXQ T2, T6           \
	\ // Column U2
	MOVQ  16+I0, DX        \
	MULXQ I1, T0, T1       \ // T1:T0 = U2*V0
	ADOXQ T3, T7           \
	ADOXQ AX, T8           \
	XORQ  AX, AX           \
	MULXQ 8+I1, T3, T2     \ // T2:T3 = U2*V1
	ADOXQ T5, T0           \
	MOVQ  T0, 16+O         \ // O[2]
	ADCXQ T3, T1           \
	MULXQ 16+I1, T4, T3    \ // T3:T4 = U2*V2
	ADCXQ T4, T2           \
	ADOXQ T6, T1           \
	MULXQ 24+I1, T9, T4    \ // T4:T9 = U2*V3
	ADCXQ T9, T3           \
	MOVQ  24+I0, DX        \
	ADCXQ AX, T4           \
	\
	ADOXQ T7, T2           \
	ADOXQ T8, T3           \
	ADOXQ AX, T4           \
	\ // Column U3
	MULXQ I1, T0, T5       \ // T5:T0 = U3*V0
	XORQ  AX, AX           \
	MULXQ 8+I1, T7, T6     \ // T6:T7 = U3*V1
	ADCXQ T7, T5           \
	ADOXQ T0, T1           \
	MULXQ 16+I1, T8, T7    \ // T7:T8 = U3*V2
	ADCXQ T8, T6           \
	ADOXQ T5, T2           \
	MULXQ 24+I1, T9, T8    \ // T8:T9 = U3*V3
	ADCXQ T9, T7           \
	ADCXQ AX, T8           \
	\
	ADOXQ T6, T3           \
	ADOXQ T7, T4           \
	ADOXQ AX, T8           \
	MOVQ  T1, 24+O         \ // O[3]
	MOVQ  T2, 32+O         \ // O[4]
	MOVQ  T3, 40+O         \ // O[5]
	MOVQ  T4, 48+O         \ // O[6] and O[7] below
	MOVQ  T8, 56+O
// Template of a macro that performs schoolbook multiplication of a 128-bit with
// a 320-bit number. It uses the MULX instruction. This template must be
// customized with functions performing ADD (add1, add2) and ADD-with-carry
// (adc1, adc2). addX/adcX may or may not be instructions that use two
// independent carry chains (ADDQ/ADCQ vs ADOXQ/ADCXQ).
// Input:
// * I0 128-bit number (memory operand, 2 limbs)
// * I1 320-bit number (global symbol; offsets start at +24(SB) — presumably
//   because the low limbs of ·p503p1 are zero, matching P503P1_3..7 above —
//   TODO(review): confirm against the data definition of ·p503p1)
// * add1, add2: instruction performing integer addition and starting carry chain
// * adc1, adc2: instruction performing integer addition with carry
// Output: result in registers T0 (lowest limb) through T6 (highest limb).
// Smashes AX and DX.
#define MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, add1, add2, adc1, adc2) \
	\ // Column 0
	MOVQ  I0, DX            \
	MULXQ I1+24(SB), T0, T1 \
	MULXQ I1+32(SB), T4, T2 \
	XORQ  AX, AX            \ // clear CF and OF; AX = 0 for the final carry add
	MULXQ I1+40(SB), T5, T3 \
	add1  T4, T1            \
	adc1  T5, T2            \
	MULXQ I1+48(SB), T7, T4 \
	adc1  T7, T3            \
	MULXQ I1+56(SB), T6, T5 \
	adc1  T6, T4            \
	adc1  AX, T5            \
	\ // Column 1
	MOVQ  8+I0, DX          \
	MULXQ I1+24(SB), T6, T7 \
	add2  T6, T1            \
	adc2  T7, T2            \
	MULXQ I1+32(SB), T8, T6 \
	adc2  T6, T3            \
	MULXQ I1+40(SB), T7, T9 \
	adc2  T9, T4            \
	MULXQ I1+48(SB), T9, T6 \
	adc2  T6, T5            \
	MULXQ I1+56(SB), DX, T6 \
	adc2  AX, T6            \
	\ // Output: merge the low limbs of column 1 into the running sum
	XORQ  AX, AX            \
	add1  T8, T2            \
	adc1  T7, T3            \
	adc1  T9, T4            \
	adc1  DX, T5            \
	adc1  AX, T6
// Multiplies 128-bit with 320-bit integer. Optimized with the MULX instruction
// only (single carry chain: ADDQ/ADCQ).
#define MULS_128x320_MULX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
	MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADDQ, ADDQ, ADCQ, ADCQ)
// Multiplies 128-bit with 320-bit integer. Optimized with MULX, ADOX and ADCX
// instructions (two independent carry chains).
#define MULS_128x320_MULXADX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
	MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADOXQ, ADCXQ, ADOXQ, ADCXQ)
// Template of a macro performing multiplication of two 512-bit numbers. It uses
// one level of Karatsuba and one level of schoolbook multiplication. Template
// must be customized with a macro performing schoolbook multiplication.
// Requires at least 97 bytes of stack frame ((SP) through 96(SP) are used).
// Input:
// * I0, I1 - two 512-bit numbers
// * MULS - either MULS256_MULX or MULS256_MULXADX
// Output: OUT - 1024-bit long
#define MUL(OUT, I0, I1, MULS) \
	\ // R[8-11]: U1+U0 (low 256 bits of the sum of the two halves of I0)
	XORQ AX, AX            \
	MOVQ ( 0)(I0), R8      \
	MOVQ ( 8)(I0), R9      \
	MOVQ (16)(I0), R10     \
	MOVQ (24)(I0), R11     \
	ADDQ (32)(I0), R8      \
	ADCQ (40)(I0), R9      \
	ADCQ (48)(I0), R10     \
	ADCQ (56)(I0), R11     \
	SBBQ $0, AX            \ // AX = 0 - carry(U1+U0): all-ones mask iff carry
	MOVQ R8, ( 0)(SP)      \
	MOVQ R9, ( 8)(SP)      \
	MOVQ R10, (16)(SP)     \
	MOVQ R11, (24)(SP)     \
	\
	\ // R[12-15]: V1+V0
	XORQ BX, BX            \
	MOVQ ( 0)(I1), R12     \
	MOVQ ( 8)(I1), R13     \
	MOVQ (16)(I1), R14     \
	MOVQ (24)(I1), R15     \
	ADDQ (32)(I1), R12     \
	ADCQ (40)(I1), R13     \
	ADCQ (48)(I1), R14     \
	ADCQ (56)(I1), R15     \
	SBBQ $0, BX            \ // BX = 0 - carry(V1+V0): all-ones mask iff carry
	MOVQ R12, (32)(SP)     \
	MOVQ R13, (40)(SP)     \
	MOVQ R14, (48)(SP)     \
	MOVQ R15, (56)(SP)     \
	\ // Mask V1+V0 by carry of U1+U0 (V1+V0 mod 2^256 if U1+U0 set carry, else 0)
	ANDQ AX, R12           \
	ANDQ AX, R13           \
	ANDQ AX, R14           \
	ANDQ AX, R15           \
	\ // Mask U1+U0 by carry of V1+V0 (U1+U0 mod 2^256 if V1+V0 set carry, else 0)
	ANDQ BX, R8            \
	ANDQ BX, R9            \
	ANDQ BX, R10           \
	ANDQ BX, R11           \
	\ // res = masked(U0+U1) + masked(V0+V1): the 2^256-weight Karatsuba correction
	ADDQ R12, R8           \
	ADCQ R13, R9           \
	ADCQ R14, R10          \
	ADCQ R15, R11          \
	\ // SP[64-96] <- res
	MOVQ R8, (64)(SP)      \
	MOVQ R9, (72)(SP)      \
	MOVQ R10, (80)(SP)     \
	MOVQ R11, (88)(SP)     \
	\ // BP will be used for schoolbook multiplication below
	MOVQ BP, 96(SP)        \
	\ // (U1+U0)*(V1+V0)
	MULS((64)(OUT), 0(SP), 32(SP), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
	\ // U0 x V0
	MULS(0(OUT), 0(I0), 0(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
	\ // U1 x V1
	MULS(0(SP), 32(I0), 32(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
	\ // Recover BP
	MOVQ 96(SP), BP        \
	\ // Final part of schoolbook multiplication; R[8-11] = (U0+U1) x (V0+V1)
	MOVQ (64)(SP), R8      \
	MOVQ (72)(SP), R9      \
	MOVQ (80)(SP), R10     \
	MOVQ (88)(SP), R11     \
	MOVQ (96)(OUT), AX     \
	ADDQ AX, R8            \
	MOVQ (104)(OUT), AX    \
	ADCQ AX, R9            \
	MOVQ (112)(OUT), AX    \
	ADCQ AX, R10           \
	MOVQ (120)(OUT), AX    \
	ADCQ AX, R11           \
	\ // R[12-15, 8-11] = (U0+U1) x (V0+V1) - U0xV0
	MOVQ (64)(OUT), R12    \
	MOVQ (72)(OUT), R13    \
	MOVQ (80)(OUT), R14    \
	MOVQ (88)(OUT), R15    \
	SUBQ ( 0)(OUT), R12    \
	SBBQ ( 8)(OUT), R13    \
	SBBQ (16)(OUT), R14    \
	SBBQ (24)(OUT), R15    \
	SBBQ (32)(OUT), R8     \
	SBBQ (40)(OUT), R9     \
	SBBQ (48)(OUT), R10    \
	SBBQ (56)(OUT), R11    \
	\ // r8-r15 <- (U0+U1) x (V0+V1) - U0xV0 - U1xV1
	SUBQ ( 0)(SP), R12     \
	SBBQ ( 8)(SP), R13     \
	SBBQ (16)(SP), R14     \
	SBBQ (24)(SP), R15     \
	SBBQ (32)(SP), R8      \
	SBBQ (40)(SP), R9      \
	SBBQ (48)(SP), R10     \
	SBBQ (56)(SP), R11     \
	\ // Accumulate the middle term at weight 2^256 into OUT
	; ADDQ (32)(OUT), R12; MOVQ R12, ( 32)(OUT) \
	; ADCQ (40)(OUT), R13; MOVQ R13, ( 40)(OUT) \
	; ADCQ (48)(OUT), R14; MOVQ R14, ( 48)(OUT) \
	; ADCQ (56)(OUT), R15; MOVQ R15, ( 56)(OUT) \
	MOVQ ( 0)(SP), AX; ADCQ AX, R8; MOVQ R8, ( 64)(OUT) \
	MOVQ ( 8)(SP), AX; ADCQ AX, R9; MOVQ R9, ( 72)(OUT) \
	MOVQ (16)(SP), AX; ADCQ AX, R10; MOVQ R10, ( 80)(OUT) \
	MOVQ (24)(SP), AX; ADCQ AX, R11; MOVQ R11, ( 88)(OUT) \
	MOVQ (32)(SP), R12; ADCQ $0, R12; MOVQ R12, ( 96)(OUT) \
	MOVQ (40)(SP), R13; ADCQ $0, R13; MOVQ R13, (104)(OUT) \
	MOVQ (48)(SP), R14; ADCQ $0, R14; MOVQ R14, (112)(OUT) \
	MOVQ (56)(SP), R15; ADCQ $0, R15; MOVQ R15, (120)(OUT)
// Template for calculating the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be
// customized with schoolbook multiplication for a 128 x 320-bit number.
// This macro reuses memory of IN value and *changes* it.
// Input:
// * IN: 1024-bit number to be reduced
// * MULS: either MULS_128x320_MULX or MULS_128x320_MULXADX
// Output: OUT 512-bit
// Each round multiplies two limbs of IN by ·p503p1 and folds the product back
// into IN at the appropriate offset; MOVQ does not affect flags, so the ADCQ
// chains continue across the store/load groups.
#define REDC(OUT, IN, MULS) \
	MULS(0(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
	XORQ R15, R15          \
	ADDQ (24)(IN), R8      \
	ADCQ (32)(IN), R9      \
	ADCQ (40)(IN), R10     \
	ADCQ (48)(IN), R11     \
	ADCQ (56)(IN), R12     \
	ADCQ (64)(IN), R13     \
	ADCQ (72)(IN), R14     \
	ADCQ (80)(IN), R15     \
	MOVQ R8, (24)(IN)      \
	MOVQ R9, (32)(IN)      \
	MOVQ R10, (40)(IN)     \
	MOVQ R11, (48)(IN)     \
	MOVQ R12, (56)(IN)     \
	MOVQ R13, (64)(IN)     \
	MOVQ R14, (72)(IN)     \
	MOVQ R15, (80)(IN)     \
	MOVQ (88)(IN), R8      \
	MOVQ (96)(IN), R9      \
	MOVQ (104)(IN), R10    \
	MOVQ (112)(IN), R11    \
	MOVQ (120)(IN), R12    \
	ADCQ $0, R8            \ // propagate the carry from the chain above
	ADCQ $0, R9            \
	ADCQ $0, R10           \
	ADCQ $0, R11           \
	ADCQ $0, R12           \
	MOVQ R8, (88)(IN)      \
	MOVQ R9, (96)(IN)      \
	MOVQ R10, (104)(IN)    \
	MOVQ R11, (112)(IN)    \
	MOVQ R12, (120)(IN)    \
	\
	MULS(16(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
	XORQ R15, R15          \
	ADDQ (40)(IN), R8      \
	ADCQ (48)(IN), R9      \
	ADCQ (56)(IN), R10     \
	ADCQ (64)(IN), R11     \
	ADCQ (72)(IN), R12     \
	ADCQ (80)(IN), R13     \
	ADCQ (88)(IN), R14     \
	ADCQ (96)(IN), R15     \
	MOVQ R8, (40)(IN)      \
	MOVQ R9, (48)(IN)      \
	MOVQ R10, (56)(IN)     \
	MOVQ R11, (64)(IN)     \
	MOVQ R12, (72)(IN)     \
	MOVQ R13, (80)(IN)     \
	MOVQ R14, (88)(IN)     \
	MOVQ R15, (96)(IN)     \
	MOVQ (104)(IN), R8     \
	MOVQ (112)(IN), R9     \
	MOVQ (120)(IN), R10    \
	ADCQ $0, R8            \
	ADCQ $0, R9            \
	ADCQ $0, R10           \
	MOVQ R8, (104)(IN)     \
	MOVQ R9, (112)(IN)     \
	MOVQ R10, (120)(IN)    \
	\
	MULS(32(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
	XORQ R15, R15          \
	XORQ BX, BX            \
	ADDQ ( 56)(IN), R8     \
	ADCQ ( 64)(IN), R9     \
	ADCQ ( 72)(IN), R10    \
	ADCQ ( 80)(IN), R11    \
	ADCQ ( 88)(IN), R12    \
	ADCQ ( 96)(IN), R13    \
	ADCQ (104)(IN), R14    \
	ADCQ (112)(IN), R15    \
	ADCQ (120)(IN), BX     \
	MOVQ R8, ( 56)(IN)     \
	MOVQ R10, ( 72)(IN)    \ // (64)(IN) is not written back: R9 is already the first output limb
	MOVQ R11, ( 80)(IN)    \
	MOVQ R12, ( 88)(IN)    \
	MOVQ R13, ( 96)(IN)    \
	MOVQ R14, (104)(IN)    \
	MOVQ R15, (112)(IN)    \
	MOVQ BX, (120)(IN)     \
	MOVQ R9, ( 0)(OUT)     \ // Result: OUT[0]
	\
	MULS(48(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
	ADDQ ( 72)(IN), R8     \
	ADCQ ( 80)(IN), R9     \
	ADCQ ( 88)(IN), R10    \
	ADCQ ( 96)(IN), R11    \
	ADCQ (104)(IN), R12    \
	ADCQ (112)(IN), R13    \
	ADCQ (120)(IN), R14    \
	MOVQ R8, ( 8)(OUT)     \ // Result: OUT[1]
	MOVQ R9, (16)(OUT)     \ // Result: OUT[2]
	MOVQ R10, (24)(OUT)    \ // Result: OUT[3]
	MOVQ R11, (32)(OUT)    \ // Result: OUT[4]
	MOVQ R12, (40)(OUT)    \ // Result: OUT[5]
	MOVQ R13, (48)(OUT)    \ // Result: OUT[6] and OUT[7]
	MOVQ R14, (56)(OUT)
// fp503StrongReduce(x *[8]uint64)
// Reduces x modulo p503 in place: computes x - p503, then conditionally adds
// p503 back if the subtraction borrowed. Branch-free (constant time): the
// borrow is turned into an all-ones/all-zeros mask with SBBQ.
TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8
	MOVQ x+0(FP), REG_P1

	// Zero AX for later use:
	XORQ AX, AX

	// Load p into registers:
	MOVQ P503_0, R8
	// P503_{1,2} = P503_0, so reuse R8
	MOVQ P503_3, R9
	MOVQ P503_4, R10
	MOVQ P503_5, R11
	MOVQ P503_6, R12
	MOVQ P503_7, R13

	// Set x <- x - p
	SUBQ R8, ( 0)(REG_P1)
	SBBQ R8, ( 8)(REG_P1)
	SBBQ R8, (16)(REG_P1)
	SBBQ R9, (24)(REG_P1)
	SBBQ R10, (32)(REG_P1)
	SBBQ R11, (40)(REG_P1)
	SBBQ R12, (48)(REG_P1)
	SBBQ R13, (56)(REG_P1)

	// Save carry flag indicating x-p < 0 as a mask
	SBBQ $0, AX

	// Conditionally add p to x if x-p < 0
	ANDQ AX, R8
	ANDQ AX, R9
	ANDQ AX, R10
	ANDQ AX, R11
	ANDQ AX, R12
	ANDQ AX, R13
	ADDQ R8, ( 0)(REG_P1)
	ADCQ R8, ( 8)(REG_P1)
	ADCQ R8, (16)(REG_P1)
	ADCQ R9, (24)(REG_P1)
	ADCQ R10,(32)(REG_P1)
	ADCQ R11,(40)(REG_P1)
	ADCQ R12,(48)(REG_P1)
	ADCQ R13,(56)(REG_P1)
	RET
// fp503ConditionalSwap(x, y *[8]uint64, choice uint8)
// Constant-time conditional swap: if choice == 1, the contents of x and y
// are exchanged limb by limb; if choice == 0, both are left unchanged.
// Implemented with a mask and XOR-swap so that memory access pattern and
// instruction flow are independent of choice.
TEXT ·fp503ConditionalSwap(SB),NOSPLIT,$0-17
	MOVQ x+0(FP), REG_P1
	MOVQ y+8(FP), REG_P2
	MOVB choice+16(FP), AL  // AL = 0 or 1
	MOVBLZX AL, AX          // AX = 0 or 1
	NEGQ AX                 // AX = 0x00..00 or 0xff..ff

#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
	MOVQ (idx*8)(REG_P1), BX \ // BX = x[idx]
	MOVQ (idx*8)(REG_P2), CX \ // CX = y[idx]
	MOVQ CX, DX              \ // DX = y[idx]
	XORQ BX, DX              \ // DX = y[idx] ^ x[idx]
	ANDQ AX, DX              \ // DX = (y[idx] ^ x[idx]) & mask
	XORQ DX, BX              \ // BX = ((y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx]
	XORQ DX, CX              \ // CX = ((y[idx] ^ x[idx]) & mask) ^ y[idx] = y[idx] or x[idx]
	MOVQ BX, (idx*8)(REG_P1) \
	MOVQ CX, (idx*8)(REG_P2)
#endif
	CSWAP_BLOCK(0)
	CSWAP_BLOCK(1)
	CSWAP_BLOCK(2)
	CSWAP_BLOCK(3)
	CSWAP_BLOCK(4)
	CSWAP_BLOCK(5)
	CSWAP_BLOCK(6)
	CSWAP_BLOCK(7)
#ifdef CSWAP_BLOCK
#undef CSWAP_BLOCK
#endif
	RET
// fp503AddReduced(z, x, y *[8]uint64)
// Computes z = x + y (mod 2*p503): z = x + y - 2*p503, then conditionally adds
// 2*p503 back if the subtraction borrowed. Branch-free (constant time).
TEXT ·fp503AddReduced(SB),NOSPLIT,$0-24
	MOVQ z+0(FP), REG_P3
	MOVQ x+8(FP), REG_P1
	MOVQ y+16(FP), REG_P2

	// Used later to calculate a mask
	XORQ CX, CX

	// [R8-R15]: z = x + y
	MOVQ ( 0)(REG_P1), R8
	MOVQ ( 8)(REG_P1), R9
	MOVQ (16)(REG_P1), R10
	MOVQ (24)(REG_P1), R11
	MOVQ (32)(REG_P1), R12
	MOVQ (40)(REG_P1), R13
	MOVQ (48)(REG_P1), R14
	MOVQ (56)(REG_P1), R15
	ADDQ ( 0)(REG_P2), R8
	ADCQ ( 8)(REG_P2), R9
	ADCQ (16)(REG_P2), R10
	ADCQ (24)(REG_P2), R11
	ADCQ (32)(REG_P2), R12
	ADCQ (40)(REG_P2), R13
	ADCQ (48)(REG_P2), R14
	ADCQ (56)(REG_P2), R15

	// z = z - 2*p503
	MOVQ P503X2_0, AX
	SUBQ AX, R8
	MOVQ P503X2_1, AX
	SBBQ AX, R9
	SBBQ AX, R10   // P503X2_2 == P503X2_1, so AX is reused
	MOVQ P503X2_3, AX
	SBBQ AX, R11
	MOVQ P503X2_4, AX
	SBBQ AX, R12
	MOVQ P503X2_5, AX
	SBBQ AX, R13
	MOVQ P503X2_6, AX
	SBBQ AX, R14
	MOVQ P503X2_7, AX
	SBBQ AX, R15

	// mask (all-ones iff the subtraction borrowed)
	SBBQ $0, CX

	// move z to REG_P3
	MOVQ R8, ( 0)(REG_P3)
	MOVQ R9, ( 8)(REG_P3)
	MOVQ R10, (16)(REG_P3)
	MOVQ R11, (24)(REG_P3)
	MOVQ R12, (32)(REG_P3)
	MOVQ R13, (40)(REG_P3)
	MOVQ R14, (48)(REG_P3)
	MOVQ R15, (56)(REG_P3)

	// if z<0 add p503x2 back. Only 7 registers are needed because
	// P503X2_1 == P503X2_2 (R9 serves for both limbs 1 and 2).
	MOVQ P503X2_0, R8
	MOVQ P503X2_1, R9
	MOVQ P503X2_3, R10
	MOVQ P503X2_4, R11
	MOVQ P503X2_5, R12
	MOVQ P503X2_6, R13
	MOVQ P503X2_7, R14
	ANDQ CX, R8
	ANDQ CX, R9
	ANDQ CX, R10
	ANDQ CX, R11
	ANDQ CX, R12
	ANDQ CX, R13
	ANDQ CX, R14
	MOVQ ( 0)(REG_P3), AX; ADDQ R8, AX; MOVQ AX, ( 0)(REG_P3)
	MOVQ ( 8)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, ( 8)(REG_P3)
	MOVQ (16)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (16)(REG_P3) // R9 again: limb 2 of 2*p503 equals limb 1
	MOVQ (24)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (24)(REG_P3)
	MOVQ (32)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (32)(REG_P3)
	MOVQ (40)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (40)(REG_P3)
	MOVQ (48)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (48)(REG_P3)
	MOVQ (56)(REG_P3), AX; ADCQ R14, AX; MOVQ AX, (56)(REG_P3)
	RET
// fp503SubReduced(z, x, y *[8]uint64)
// Computes z = x - y (mod 2*p503): z = x - y, then conditionally adds 2*p503
// if the subtraction borrowed. Branch-free (constant time).
TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24
	MOVQ z+0(FP), REG_P3
	MOVQ x+8(FP), REG_P1
	MOVQ y+16(FP), REG_P2

	// Used later to calculate a mask
	XORQ CX, CX

	MOVQ ( 0)(REG_P1), R8
	MOVQ ( 8)(REG_P1), R9
	MOVQ (16)(REG_P1), R10
	MOVQ (24)(REG_P1), R11
	MOVQ (32)(REG_P1), R12
	MOVQ (40)(REG_P1), R13
	MOVQ (48)(REG_P1), R14
	MOVQ (56)(REG_P1), R15
	SUBQ ( 0)(REG_P2), R8
	SBBQ ( 8)(REG_P2), R9
	SBBQ (16)(REG_P2), R10
	SBBQ (24)(REG_P2), R11
	SBBQ (32)(REG_P2), R12
	SBBQ (40)(REG_P2), R13
	SBBQ (48)(REG_P2), R14
	SBBQ (56)(REG_P2), R15

	// mask (all-ones iff the subtraction borrowed)
	SBBQ $0, CX

	// store x-y in REG_P3
	MOVQ R8, ( 0)(REG_P3)
	MOVQ R9, ( 8)(REG_P3)
	MOVQ R10, (16)(REG_P3)
	MOVQ R11, (24)(REG_P3)
	MOVQ R12, (32)(REG_P3)
	MOVQ R13, (40)(REG_P3)
	MOVQ R14, (48)(REG_P3)
	MOVQ R15, (56)(REG_P3)

	// if z<0 add p503x2 back. Only 7 registers are needed because
	// P503X2_1 == P503X2_2 (R9 serves for both limbs 1 and 2).
	MOVQ P503X2_0, R8
	MOVQ P503X2_1, R9
	MOVQ P503X2_3, R10
	MOVQ P503X2_4, R11
	MOVQ P503X2_5, R12
	MOVQ P503X2_6, R13
	MOVQ P503X2_7, R14
	ANDQ CX, R8
	ANDQ CX, R9
	ANDQ CX, R10
	ANDQ CX, R11
	ANDQ CX, R12
	ANDQ CX, R13
	ANDQ CX, R14
	MOVQ ( 0)(REG_P3), AX; ADDQ R8, AX; MOVQ AX, ( 0)(REG_P3)
	MOVQ ( 8)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, ( 8)(REG_P3)
	MOVQ (16)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (16)(REG_P3) // R9 again: limb 2 of 2*p503 equals limb 1
	MOVQ (24)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (24)(REG_P3)
	MOVQ (32)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (32)(REG_P3)
	MOVQ (40)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (40)(REG_P3)
	MOVQ (48)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (48)(REG_P3)
	MOVQ (56)(REG_P3), AX; ADCQ R14, AX; MOVQ AX, (56)(REG_P3)
	RET
  649. TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24
  650. // Actual implementation
  651. MOVQ z+ 0(FP), CX
  652. MOVQ x+ 8(FP), REG_P2
  653. MOVQ y+16(FP), REG_P1
  654. MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
  655. RET
  656. // TEXT ·fp503Mul(SB), NOSPLIT, $104-24
  657. // // Actual implementation
  658. // MOVQ z+ 0(FP), CX
  659. // MOVQ x+ 8(FP), REG_P2
  660. // MOVQ y+16(FP), REG_P1
  661. // MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
  662. // RET
  663. TEXT ·fp503MulXXX(SB), NOSPLIT, $104-72
  664. // Actual implementation
  665. MOVQ z+ 0(FP), CX
  666. MOVQ x+24(FP), REG_P2
  667. MOVQ y+48(FP), REG_P1
  668. MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
  669. RET
  670. TEXT ·mulWithMULX(SB), NOSPLIT, $104-24
  671. // Actual implementation
  672. MOVQ z+ 0(FP), CX
  673. MOVQ x+ 8(FP), REG_P2
  674. MOVQ y+16(FP), REG_P1
  675. MUL(CX, REG_P2, REG_P1, MULS256_MULX)
  676. RET
// mul(z *[16]uint64, x, y *[8]uint64)
// 512x512 -> 1024-bit multiplication using one level of Karatsuba on top of
// 256x256-bit comba (column-wise schoolbook) multiplications, built on the
// legacy MULQ instruction (no BMI2/ADX required).
// Uses variant of Karatsuba method.
//
// Here we store the destination in CX instead of in REG_P3 because the
// multiplication instructions use DX as an implicit destination
// operand: MULQ $REG sets DX:AX <-- AX * $REG.
// Actual implementation
TEXT ·mul(SB), $96-24
	MOVQ z+0(FP), CX
	MOVQ x+8(FP), REG_P1
	MOVQ y+16(FP), REG_P2

	// RAX and RDX will be used for a mask (0-borrow)
	XORQ AX, AX

	// RCX[0-3]: U1+U0 (low 256 bits; z's buffer is used as scratch)
	MOVQ (32)(REG_P1), R8
	MOVQ (40)(REG_P1), R9
	MOVQ (48)(REG_P1), R10
	MOVQ (56)(REG_P1), R11
	ADDQ ( 0)(REG_P1), R8
	ADCQ ( 8)(REG_P1), R9
	ADCQ (16)(REG_P1), R10
	ADCQ (24)(REG_P1), R11
	MOVQ R8, ( 0)(CX)
	MOVQ R9, ( 8)(CX)
	MOVQ R10, (16)(CX)
	MOVQ R11, (24)(CX)
	SBBQ $0, AX   // AX = all-ones mask iff U1+U0 carried

	// R12-R15: V1+V0
	XORQ DX, DX
	MOVQ (32)(REG_P2), R12
	MOVQ (40)(REG_P2), R13
	MOVQ (48)(REG_P2), R14
	MOVQ (56)(REG_P2), R15
	ADDQ ( 0)(REG_P2), R12
	ADCQ ( 8)(REG_P2), R13
	ADCQ (16)(REG_P2), R14
	ADCQ (24)(REG_P2), R15
	SBBQ $0, DX   // DX = all-ones mask iff V1+V0 carried

	// Store carry masks on stack
	MOVQ AX, (64)(SP)
	MOVQ DX, (72)(SP)

	// (SP[0-3],R8,R9,R10,R11) <- (U0+U1)*(V0+V1).
	// MUL using comba; In comments below U=U0+U1 V=V0+V1

	// U0*V0
	MOVQ (CX), AX
	MULQ R12
	MOVQ AX, (SP)        // C0
	MOVQ DX, R8

	// U0*V1
	XORQ R9, R9
	MOVQ (CX), AX
	MULQ R13
	ADDQ AX, R8
	ADCQ DX, R9

	// U1*V0
	XORQ R10, R10
	MOVQ (8)(CX), AX
	MULQ R12
	ADDQ AX, R8
	MOVQ R8, (8)(SP)     // C1
	ADCQ DX, R9
	ADCQ $0, R10

	// U0*V2
	XORQ R8, R8
	MOVQ (CX), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	// U2*V0
	MOVQ (16)(CX), AX
	MULQ R12
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	// U1*V1
	MOVQ (8)(CX), AX
	MULQ R13
	ADDQ AX, R9
	MOVQ R9, (16)(SP)    // C2
	ADCQ DX, R10
	ADCQ $0, R8

	// U0*V3
	XORQ R9, R9
	MOVQ (CX), AX
	MULQ R15
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U3*V0
	MOVQ (24)(CX), AX
	MULQ R12
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U1*V2
	MOVQ (8)(CX), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U2*V1
	MOVQ (16)(CX), AX
	MULQ R13
	ADDQ AX, R10
	MOVQ R10, (24)(SP)   // C3
	ADCQ DX, R8
	ADCQ $0, R9

	// U1*V3
	XORQ R10, R10
	MOVQ (8)(CX), AX
	MULQ R15
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// U3*V1
	MOVQ (24)(CX), AX
	MULQ R13
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// U2*V2
	MOVQ (16)(CX), AX
	MULQ R14
	ADDQ AX, R8
	MOVQ R8, (32)(SP)    // C4
	ADCQ DX, R9
	ADCQ $0, R10

	// U2*V3
	XORQ R11, R11
	MOVQ (16)(CX), AX
	MULQ R15
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R11

	// U3*V2
	MOVQ (24)(CX), AX
	MULQ R14
	ADDQ AX, R9          // C5
	ADCQ DX, R10
	ADCQ $0, R11

	// U3*V3
	MOVQ (24)(CX), AX
	MULQ R15
	ADDQ AX, R10         // C6
	ADCQ DX, R11         // C7

	// Karatsuba correction: add (V1+V0 masked by U-carry) and
	// (U1+U0 masked by V-carry) to the high half C4..C7.
	MOVQ (64)(SP), AX
	ANDQ AX, R12
	ANDQ AX, R13
	ANDQ AX, R14
	ANDQ AX, R15
	ADDQ R8, R12
	ADCQ R9, R13
	ADCQ R10, R14
	ADCQ R11, R15
	MOVQ (72)(SP), AX
	MOVQ (CX), R8
	MOVQ (8)(CX), R9
	MOVQ (16)(CX), R10
	MOVQ (24)(CX), R11
	ANDQ AX, R8
	ANDQ AX, R9
	ANDQ AX, R10
	ANDQ AX, R11
	ADDQ R12, R8
	ADCQ R13, R9
	ADCQ R14, R10
	ADCQ R15, R11
	MOVQ R8, (32)(SP)
	MOVQ R9, (40)(SP)
	MOVQ R10, (48)(SP)
	MOVQ R11, (56)(SP)

	// CX[0-7] <- AL*BL (U0*V0; here U/V denote the 256-bit halves of x/y)
	// U0*V0
	MOVQ (REG_P1), R11
	MOVQ (REG_P2), AX
	MULQ R11
	XORQ R9, R9
	MOVQ AX, (CX)        // C0
	MOVQ DX, R8

	// U0*V1
	MOVQ (16)(REG_P1), R14
	MOVQ (8)(REG_P2), AX
	MULQ R11
	XORQ R10, R10
	ADDQ AX, R8
	ADCQ DX, R9

	// U1*V0
	MOVQ (8)(REG_P1), R12
	MOVQ (REG_P2), AX
	MULQ R12
	ADDQ AX, R8
	MOVQ R8, (8)(CX)     // C1
	ADCQ DX, R9
	ADCQ $0, R10

	// U0*V2
	XORQ R8, R8
	MOVQ (16)(REG_P2), AX
	MULQ R11
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	// U2*V0
	MOVQ (REG_P2), R13
	MOVQ R14, AX
	MULQ R13
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	// U1*V1
	MOVQ (8)(REG_P2), AX
	MULQ R12
	ADDQ AX, R9
	MOVQ R9, (16)(CX)    // C2
	ADCQ DX, R10
	ADCQ $0, R8

	// U0*V3
	XORQ R9, R9
	MOVQ (24)(REG_P2), AX
	MULQ R11
	MOVQ (24)(REG_P1), R15
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U3*V0
	MOVQ R15, AX
	MULQ R13
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U1*V2
	MOVQ (16)(REG_P2), AX
	MULQ R12
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9

	// U2*V1
	MOVQ (8)(REG_P2), AX
	MULQ R14
	ADDQ AX, R10
	MOVQ R10, (24)(CX)   // C3
	ADCQ DX, R8
	ADCQ $0, R9

	// U1*V3
	XORQ R10, R10
	MOVQ (24)(REG_P2), AX
	MULQ R12
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// U3*V1
	MOVQ (8)(REG_P2), AX
	MULQ R15
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10

	// U2*V2
	MOVQ (16)(REG_P2), AX
	MULQ R14
	ADDQ AX, R8
	MOVQ R8, (32)(CX)    // C4
	ADCQ DX, R9
	ADCQ $0, R10

	// U2*V3
	XORQ R8, R8
	MOVQ (24)(REG_P2), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8

	// U3*V2
	MOVQ (16)(REG_P2), AX
	MULQ R15
	ADDQ AX, R9
	MOVQ R9, (40)(CX)    // C5
	ADCQ DX, R10
	ADCQ $0, R8

	// U3*V3
	MOVQ (24)(REG_P2), AX
	MULQ R15
	ADDQ AX, R10
	MOVQ R10, (48)(CX)   // C6
	ADCQ DX, R8
	MOVQ R8, (56)(CX)    // C7

	// CX[8-15] <- U1*V1 (same comba schedule as above, on the high halves)
	MOVQ (32)(REG_P1), R11
	MOVQ (32)(REG_P2), AX
	MULQ R11
	XORQ R9, R9
	MOVQ AX, (64)(CX)    // C0
	MOVQ DX, R8
	MOVQ (48)(REG_P1), R14
	MOVQ (40)(REG_P2), AX
	MULQ R11
	XORQ R10, R10
	ADDQ AX, R8
	ADCQ DX, R9
	MOVQ (40)(REG_P1), R12
	MOVQ (32)(REG_P2), AX
	MULQ R12
	ADDQ AX, R8
	MOVQ R8, (72)(CX)    // C1
	ADCQ DX, R9
	ADCQ $0, R10
	XORQ R8, R8
	MOVQ (48)(REG_P2), AX
	MULQ R11
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8
	MOVQ (32)(REG_P2), R13
	MOVQ R14, AX
	MULQ R13
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8
	MOVQ (40)(REG_P2), AX
	MULQ R12
	ADDQ AX, R9
	MOVQ R9, (80)(CX)    // C2
	ADCQ DX, R10
	ADCQ $0, R8
	XORQ R9, R9
	MOVQ (56)(REG_P2), AX
	MULQ R11
	MOVQ (56)(REG_P1), R15
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9
	MOVQ R15, AX
	MULQ R13
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9
	MOVQ (48)(REG_P2), AX
	MULQ R12
	ADDQ AX, R10
	ADCQ DX, R8
	ADCQ $0, R9
	MOVQ (40)(REG_P2), AX
	MULQ R14
	ADDQ AX, R10
	MOVQ R10, (88)(CX)   // C3
	ADCQ DX, R8
	ADCQ $0, R9
	XORQ R10, R10
	MOVQ (56)(REG_P2), AX
	MULQ R12
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10
	MOVQ (40)(REG_P2), AX
	MULQ R15
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0, R10
	MOVQ (48)(REG_P2), AX
	MULQ R14
	ADDQ AX, R8
	MOVQ R8, (96)(CX)    // C4
	ADCQ DX, R9
	ADCQ $0, R10
	XORQ R8, R8
	MOVQ (56)(REG_P2), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ DX, R10
	ADCQ $0, R8
	MOVQ (48)(REG_P2), AX
	MULQ R15
	ADDQ AX, R9
	MOVQ R9, (104)(CX)   // C5
	ADCQ DX, R10
	ADCQ $0, R8
	MOVQ (56)(REG_P2), AX
	MULQ R15
	ADDQ AX, R10
	MOVQ R10, (112)(CX)  // C6
	ADCQ DX, R8
	MOVQ R8, (120)(CX)   // C7

	// [R8-R15] <- (U0+U1)*(V0+V1) - U0*V0
	MOVQ (SP), R8
	SUBQ (CX), R8
	MOVQ (8)(SP), R9
	SBBQ (8)(CX), R9
	MOVQ (16)(SP), R10
	SBBQ (16)(CX), R10
	MOVQ (24)(SP), R11
	SBBQ (24)(CX), R11
	MOVQ (32)(SP), R12
	SBBQ (32)(CX), R12
	MOVQ (40)(SP), R13
	SBBQ (40)(CX), R13
	MOVQ (48)(SP), R14
	SBBQ (48)(CX), R14
	MOVQ (56)(SP), R15
	SBBQ (56)(CX), R15

	// [R8-R15] <- (U0+U1)*(V0+V1) - U0*V0 - U1*V1
	MOVQ ( 64)(CX), AX; SUBQ AX, R8
	MOVQ ( 72)(CX), AX; SBBQ AX, R9
	MOVQ ( 80)(CX), AX; SBBQ AX, R10
	MOVQ ( 88)(CX), AX; SBBQ AX, R11
	MOVQ ( 96)(CX), AX; SBBQ AX, R12
	MOVQ (104)(CX), DX; SBBQ DX, R13
	MOVQ (112)(CX), DI; SBBQ DI, R14
	MOVQ (120)(CX), SI; SBBQ SI, R15

	// Final result: accumulate the middle term at weight 2^256
	ADDQ (32)(CX), R8;  MOVQ R8, (32)(CX)
	ADCQ (40)(CX), R9;  MOVQ R9, (40)(CX)
	ADCQ (48)(CX), R10; MOVQ R10, (48)(CX)
	ADCQ (56)(CX), R11; MOVQ R11, (56)(CX)
	ADCQ (64)(CX), R12; MOVQ R12, (64)(CX)
	ADCQ (72)(CX), R13; MOVQ R13, (72)(CX)
	ADCQ (80)(CX), R14; MOVQ R14, (80)(CX)
	ADCQ (88)(CX), R15; MOVQ R15, (88)(CX)
	ADCQ $0, AX; MOVQ AX, (96)(CX)
	ADCQ $0, DX; MOVQ DX, (104)(CX)
	ADCQ $0, DI; MOVQ DI, (112)(CX)
	ADCQ $0, SI; MOVQ SI, (120)(CX)
	RET
// func redc(z, x *[...]uint64)
// Montgomery reduction for p503: reads a 16-limb (1024-bit) input at x and
// writes the reduced 8-limb (512-bit) result to z, exploiting the shape of
// p503+1 (its three low limbs are zero, so only P503P1_3..P503P1_7 appear).
// Uses schoolbook MULQ (results in DX:AX) with three rotating accumulator
// registers; every ADDQ/ADCQ/ADCQ $0 triple folds one 128-bit partial
// product plus carry into the running column. Statement order is critical:
// MULQ clobbers DX:AX and the carry chain lives in the flags between steps.
// Clobbers: AX, DX, CX, R8-R15, flags.
TEXT ·redc(SB), $0-16
    MOVQ z+0(FP), REG_P2             // REG_P2 = z (output, also scratch for low limbs)
    MOVQ x+8(FP), REG_P1             // REG_P1 = x (input)

    // q0 = x[0]; first column produces z[3].
    MOVQ (REG_P1), R11
    MOVQ P503P1_3, AX
    MULQ R11
    XORQ R8, R8
    ADDQ (24)(REG_P1), AX
    MOVQ AX, (24)(REG_P2)            // z[3] (consumed again as a quotient limb below)
    ADCQ DX, R8

    // Column for Z4: x[0]*(p+1)[4] + x[1]*(p+1)[3] + x[4].
    XORQ R9, R9
    MOVQ P503P1_4, AX
    MULQ R11
    XORQ R10, R10
    ADDQ AX, R8
    ADCQ DX, R9
    MOVQ (8)(REG_P1), R12            // R12 = x[1]
    MOVQ P503P1_3, AX
    MULQ R12
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    ADDQ (32)(REG_P1), R8
    MOVQ R8, (32)(REG_P2)            // Z4
    ADCQ $0, R9
    ADCQ $0, R10

    // Column for Z5: partial products of x[0..2] with (p+1)[3..5] + x[5].
    XORQ R8, R8
    MOVQ P503P1_5, AX
    MULQ R11
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    MOVQ P503P1_4, AX
    MULQ R12
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    MOVQ (16)(REG_P1), R13           // R13 = x[2]
    MOVQ P503P1_3, AX
    MULQ R13
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    ADDQ (40)(REG_P1), R9
    MOVQ R9, (40)(REG_P2)            // Z5
    ADCQ $0, R10
    ADCQ $0, R8

    // Column for Z6; z[3] (stored above) re-enters as quotient limb R14.
    XORQ R9, R9
    MOVQ P503P1_6, AX
    MULQ R11
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    MOVQ P503P1_5, AX
    MULQ R12
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    MOVQ P503P1_4, AX
    MULQ R13
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    MOVQ (24)(REG_P2), R14           // R14 = z[3]
    MOVQ P503P1_3, AX
    MULQ R14
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    ADDQ (48)(REG_P1), R10
    MOVQ R10, (48)(REG_P2)           // Z6
    ADCQ $0, R8
    ADCQ $0, R9

    // Column for Z7; z[4] re-enters as quotient limb R15.
    XORQ R10, R10
    MOVQ P503P1_7, AX
    MULQ R11
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    MOVQ P503P1_6, AX
    MULQ R12
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    MOVQ P503P1_5, AX
    MULQ R13
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    MOVQ P503P1_4, AX
    MULQ R14
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    MOVQ (32)(REG_P2), R15           // R15 = z[4]
    MOVQ P503P1_3, AX
    MULQ R15
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    ADDQ (56)(REG_P1), R8
    MOVQ R8, (56)(REG_P2)            // Z7
    ADCQ $0, R9
    ADCQ $0, R10

    // From here the columns wrap around: results go to z[0..7] in order,
    // folding in the high input limbs x[8..15] at offsets 64..120.
    XORQ R8, R8
    MOVQ P503P1_7, AX
    MULQ R12
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    MOVQ P503P1_6, AX
    MULQ R13
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    MOVQ P503P1_5, AX
    MULQ R14
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    MOVQ P503P1_4, AX
    MULQ R15
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    MOVQ (40)(REG_P2), CX            // CX = z[5]
    MOVQ P503P1_3, AX
    MULQ CX
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    ADDQ (64)(REG_P1), R9
    MOVQ R9, (REG_P2)                // Z0
    ADCQ $0, R10
    ADCQ $0, R8

    XORQ R9, R9
    MOVQ P503P1_7, AX
    MULQ R13
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    MOVQ P503P1_6, AX
    MULQ R14
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    MOVQ P503P1_5, AX
    MULQ R15
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    MOVQ P503P1_4, AX
    MULQ CX
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    MOVQ (48)(REG_P2), R13           // R13 reused: z[6]
    MOVQ P503P1_3, AX
    MULQ R13
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    ADDQ (72)(REG_P1), R10
    MOVQ R10, (8)(REG_P2)            // Z1
    ADCQ $0, R8
    ADCQ $0, R9

    XORQ R10, R10
    MOVQ P503P1_7, AX
    MULQ R14
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    MOVQ P503P1_6, AX
    MULQ R15
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    MOVQ P503P1_5, AX
    MULQ CX
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    MOVQ P503P1_4, AX
    MULQ R13
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    MOVQ (56)(REG_P2), R14           // R14 reused: z[7]
    MOVQ P503P1_3, AX
    MULQ R14
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    ADDQ (80)(REG_P1), R8
    MOVQ R8, (16)(REG_P2)            // Z2
    ADCQ $0, R9
    ADCQ $0, R10

    XORQ R8, R8
    MOVQ P503P1_7, AX
    MULQ R15
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    MOVQ P503P1_6, AX
    MULQ CX
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    MOVQ P503P1_5, AX
    MULQ R13
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    MOVQ P503P1_4, AX
    MULQ R14
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8
    ADDQ (88)(REG_P1), R9
    MOVQ R9, (24)(REG_P2)            // Z3
    ADCQ $0, R10
    ADCQ $0, R8

    // Tail columns: progressively fewer partial products remain.
    XORQ R9, R9
    MOVQ P503P1_7, AX
    MULQ CX
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    MOVQ P503P1_6, AX
    MULQ R13
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    MOVQ P503P1_5, AX
    MULQ R14
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9
    ADDQ (96)(REG_P1), R10
    MOVQ R10, (32)(REG_P2)           // Z4
    ADCQ $0, R8
    ADCQ $0, R9

    XORQ R10, R10
    MOVQ P503P1_7, AX
    MULQ R13
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    MOVQ P503P1_6, AX
    MULQ R14
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10
    ADDQ (104)(REG_P1), R8           // Z5
    MOVQ R8, (40)(REG_P2)            // Z5
    ADCQ $0, R9
    ADCQ $0, R10

    MOVQ P503P1_7, AX
    MULQ R14
    ADDQ AX, R9
    ADCQ DX, R10
    ADDQ (112)(REG_P1), R9           // Z6
    MOVQ R9, (48)(REG_P2)            // Z6
    ADCQ $0, R10
    ADDQ (120)(REG_P1), R10          // Z7
    MOVQ R10, (56)(REG_P2)           // Z7
    RET
  1363. TEXT ·redcWithMULX(SB), $0-16
  1364. MOVQ z+0(FP), DI
  1365. MOVQ x+8(FP), SI
  1366. REDC(DI, SI, MULS_128x320_MULX)
  1367. RET
  1368. TEXT ·redcWithMULXADX(SB), $0-16
  1369. MOVQ z+0(FP), DI
  1370. MOVQ x+8(FP), SI
  1371. REDC(DI, SI, MULS_128x320_MULXADX)
  1372. RET
  1373. TEXT ·fp503MontgomeryReduce(SB), $0-16
  1374. MOVQ z+0(FP), DI
  1375. MOVQ x+8(FP), SI
  1376. REDC(DI, SI, MULS_128x320_MULXADX)
  1377. RET
  1378. TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24
  1379. MOVQ z+0(FP), REG_P3
  1380. MOVQ x+8(FP), REG_P1
  1381. MOVQ y+16(FP), REG_P2
  1382. MOVQ (REG_P1), R8
  1383. MOVQ (8)(REG_P1), R9
  1384. MOVQ (16)(REG_P1), R10
  1385. MOVQ (24)(REG_P1), R11
  1386. MOVQ (32)(REG_P1), R12
  1387. MOVQ (40)(REG_P1), R13
  1388. MOVQ (48)(REG_P1), R14
  1389. MOVQ (56)(REG_P1), R15
  1390. ADDQ (REG_P2), R8
  1391. ADCQ (8)(REG_P2), R9
  1392. ADCQ (16)(REG_P2), R10
  1393. ADCQ (24)(REG_P2), R11
  1394. ADCQ (32)(REG_P2), R12
  1395. ADCQ (40)(REG_P2), R13
  1396. ADCQ (48)(REG_P2), R14
  1397. ADCQ (56)(REG_P2), R15
  1398. MOVQ R8, (REG_P3)
  1399. MOVQ R9, (8)(REG_P3)
  1400. MOVQ R10, (16)(REG_P3)
  1401. MOVQ R11, (24)(REG_P3)
  1402. MOVQ R12, (32)(REG_P3)
  1403. MOVQ R13, (40)(REG_P3)
  1404. MOVQ R14, (48)(REG_P3)
  1405. MOVQ R15, (56)(REG_P3)
  1406. RET
// func fp503X2AddLazy(z, x, y *[16]uint64)
// 16-limb (1024-bit) addition without reduction: z = x + y.
// Done in two batches (limbs 0-10, then 11-15) because only eleven
// general-purpose registers are free. NOTE: the carry produced by the
// ADCQ at offset 80 must survive into the second batch — the intervening
// MOVQ loads/stores do not modify RFLAGS, so the ADCQ at offset 88
// legitimately continues the same carry chain.
TEXT ·fp503X2AddLazy(SB), NOSPLIT, $0-24
    MOVQ z+0(FP), REG_P3
    MOVQ x+8(FP), REG_P1
    MOVQ y+16(FP), REG_P2

    // Batch 1: load x[0..10].
    MOVQ (REG_P1), R8
    MOVQ (8)(REG_P1), R9
    MOVQ (16)(REG_P1), R10
    MOVQ (24)(REG_P1), R11
    MOVQ (32)(REG_P1), R12
    MOVQ (40)(REG_P1), R13
    MOVQ (48)(REG_P1), R14
    MOVQ (56)(REG_P1), R15
    MOVQ (64)(REG_P1), AX
    MOVQ (72)(REG_P1), BX
    MOVQ (80)(REG_P1), CX
    // Add y[0..10] with carry propagation.
    ADDQ (REG_P2), R8
    ADCQ (8)(REG_P2), R9
    ADCQ (16)(REG_P2), R10
    ADCQ (24)(REG_P2), R11
    ADCQ (32)(REG_P2), R12
    ADCQ (40)(REG_P2), R13
    ADCQ (48)(REG_P2), R14
    ADCQ (56)(REG_P2), R15
    ADCQ (64)(REG_P2), AX
    ADCQ (72)(REG_P2), BX
    ADCQ (80)(REG_P2), CX           // carry out of limb 10 stays live in CF
    // Store z[0..10] (MOVQ preserves CF).
    MOVQ R8, (REG_P3)
    MOVQ R9, (8)(REG_P3)
    MOVQ R10, (16)(REG_P3)
    MOVQ R11, (24)(REG_P3)
    MOVQ R12, (32)(REG_P3)
    MOVQ R13, (40)(REG_P3)
    MOVQ R14, (48)(REG_P3)
    MOVQ R15, (56)(REG_P3)
    MOVQ AX, (64)(REG_P3)
    MOVQ BX, (72)(REG_P3)
    MOVQ CX, (80)(REG_P3)
    // Batch 2: limbs 11..15, continuing the carry chain from batch 1.
    MOVQ (88)(REG_P1), R8
    MOVQ (96)(REG_P1), R9
    MOVQ (104)(REG_P1), R10
    MOVQ (112)(REG_P1), R11
    MOVQ (120)(REG_P1), R12
    ADCQ (88)(REG_P2), R8           // consumes the carry from offset 80
    ADCQ (96)(REG_P2), R9
    ADCQ (104)(REG_P2), R10
    ADCQ (112)(REG_P2), R11
    ADCQ (120)(REG_P2), R12
    MOVQ R8, (88)(REG_P3)
    MOVQ R9, (96)(REG_P3)
    MOVQ R10, (104)(REG_P3)
    MOVQ R11, (112)(REG_P3)
    MOVQ R12, (120)(REG_P3)
    RET
// func fp503X2SubLazy(z, x, y *[16]uint64)
// 16-limb (1024-bit) subtraction z = x - y, with a constant-time
// correction: if the subtraction borrows, p503 (shifted up by 512 bits,
// i.e. applied to limbs 8..15) is added back. The correction is done
// branchlessly by turning the final borrow into an all-ones/all-zeros
// mask (SBBQ $0, CX) and ANDing it into the p503 limbs, which keeps the
// code constant-time with respect to the sign of x - y.
TEXT ·fp503X2SubLazy(SB), NOSPLIT, $0-24
    MOVQ z+0(FP), REG_P3
    MOVQ x+8(FP), REG_P1
    MOVQ y+16(FP), REG_P2
    // Used later to store result of 0-borrow
    XORQ CX, CX
    // SUBC for first 11 limbs
    MOVQ (REG_P1), R8
    MOVQ (8)(REG_P1), R9
    MOVQ (16)(REG_P1), R10
    MOVQ (24)(REG_P1), R11
    MOVQ (32)(REG_P1), R12
    MOVQ (40)(REG_P1), R13
    MOVQ (48)(REG_P1), R14
    MOVQ (56)(REG_P1), R15
    MOVQ (64)(REG_P1), AX
    MOVQ (72)(REG_P1), BX
    SUBQ (REG_P2), R8
    SBBQ (8)(REG_P2), R9
    SBBQ (16)(REG_P2), R10
    SBBQ (24)(REG_P2), R11
    SBBQ (32)(REG_P2), R12
    SBBQ (40)(REG_P2), R13
    SBBQ (48)(REG_P2), R14
    SBBQ (56)(REG_P2), R15
    SBBQ (64)(REG_P2), AX
    SBBQ (72)(REG_P2), BX           // borrow out of limb 9 stays live in CF
    MOVQ R8, (REG_P3)
    MOVQ R9, (8)(REG_P3)
    MOVQ R10, (16)(REG_P3)
    MOVQ R11, (24)(REG_P3)
    MOVQ R12, (32)(REG_P3)
    MOVQ R13, (40)(REG_P3)
    MOVQ R14, (48)(REG_P3)
    MOVQ R15, (56)(REG_P3)
    MOVQ AX, (64)(REG_P3)
    MOVQ BX, (72)(REG_P3)
    // SUBC for last 5 limbs
    // (MOVQ preserves CF, so the SBBQ chain continues across the stores/loads.)
    MOVQ (80)(REG_P1), R8
    MOVQ (88)(REG_P1), R9
    MOVQ (96)(REG_P1), R10
    MOVQ (104)(REG_P1), R11
    MOVQ (112)(REG_P1), R12
    MOVQ (120)(REG_P1), R13
    SBBQ (80)(REG_P2), R8
    SBBQ (88)(REG_P2), R9
    SBBQ (96)(REG_P2), R10
    SBBQ (104)(REG_P2), R11
    SBBQ (112)(REG_P2), R12
    SBBQ (120)(REG_P2), R13
    MOVQ R8, (80)(REG_P3)
    MOVQ R9, (88)(REG_P3)
    MOVQ R10, (96)(REG_P3)
    MOVQ R11, (104)(REG_P3)
    MOVQ R12, (112)(REG_P3)
    MOVQ R13, (120)(REG_P3)
    // Now the carry flag is 1 if x-y < 0. If so, add p*2^512.
    SBBQ $0, CX                     // CX = 0 - borrow: all-ones mask if x<y, else 0
    // Load p into registers:
    MOVQ P503_0, R8
    // P503_{1,2} = P503_0, so reuse R8
    MOVQ P503_3, R9
    MOVQ P503_4, R10
    MOVQ P503_5, R11
    MOVQ P503_6, R12
    MOVQ P503_7, R13
    // Mask p with CX: limbs become 0 when no borrow occurred.
    ANDQ CX, R8
    ANDQ CX, R9
    ANDQ CX, R10
    ANDQ CX, R11
    ANDQ CX, R12
    ANDQ CX, R13
    // Add (masked) p into the high half z[8..15], read-modify-write per limb.
    MOVQ (64 )(REG_P3), AX; ADDQ R8, AX; MOVQ AX, (64 )(REG_P3)
    MOVQ (64+ 8)(REG_P3), AX; ADCQ R8, AX; MOVQ AX, (64+ 8)(REG_P3)
    MOVQ (64+16)(REG_P3), AX; ADCQ R8, AX; MOVQ AX, (64+16)(REG_P3)
    MOVQ (64+24)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (64+24)(REG_P3)
    MOVQ (64+32)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (64+32)(REG_P3)
    MOVQ (64+40)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (64+40)(REG_P3)
    MOVQ (64+48)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (64+48)(REG_P3)
    MOVQ (64+56)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (64+56)(REG_P3)
    RET