Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.
 
 
 

1709 рядки
39 KiB

// +build amd64,!noasm

#include "textflag.h"

// p503 — the field prime as 8 little-endian 64-bit limbs.
// Limbs 0-2 are all-ones (see fp503StrongReduce, which reuses one register for them).
#define P503_0 $0xFFFFFFFFFFFFFFFF
#define P503_1 $0xFFFFFFFFFFFFFFFF
#define P503_2 $0xFFFFFFFFFFFFFFFF
#define P503_3 $0xABFFFFFFFFFFFFFF
#define P503_4 $0x13085BDA2211E7A0
#define P503_5 $0x1B9BF6C87B7E7DAF
#define P503_6 $0x6045C6BDDA77A4D0
#define P503_7 $0x004066F541811E1E
// p503+1 — limbs 0-2 of p503+1 are zero and therefore have no macro.
#define P503P1_3 $0xAC00000000000000
#define P503P1_4 $0x13085BDA2211E7A0
#define P503P1_5 $0x1B9BF6C87B7E7DAF
#define P503P1_6 $0x6045C6BDDA77A4D0
#define P503P1_7 $0x004066F541811E1E
// p503x2 — limbs of 2*p503. Note P503X2_2 == P503X2_1, which the add/sub
// routines exploit by reusing a single register for both limbs.
#define P503X2_0 $0xFFFFFFFFFFFFFFFE
#define P503X2_1 $0xFFFFFFFFFFFFFFFF
#define P503X2_2 $0xFFFFFFFFFFFFFFFF
#define P503X2_3 $0x57FFFFFFFFFFFFFF
#define P503X2_4 $0x2610B7B44423CF41
#define P503X2_5 $0x3737ED90F6FCFB5E
#define P503X2_6 $0xC08B8D7BB4EF49A0
#define P503X2_7 $0x0080CDEA83023C3C
// Register aliases for pointer arguments, used by all functions in this file.
#define REG_P1 DI
#define REG_P2 SI
#define REG_P3 DX
// Performs schoolbook multiplication of 2 256-bit numbers. This optimized version
// uses the MULX instruction. Macro smashes the value in DX.
// Input: I0 and I1 (memory operands, 4 64-bit limbs each).
// Output: O (memory operand, 8 64-bit limbs).
// All the other arguments are registers, used for storing temporary values.
// Comment notation "Th:Tl = Ux*Vy" means high limb in Th, low limb in Tl.
#define MULS256_MULX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
MOVQ I0, DX \
MULXQ I1, T1, T0 \ // T0:T1 = U0*V0
MOVQ T1, O \ // O[0]
MULXQ 8+I1, T2, T1 \ // T1:T2 = U0*V1
ADDQ T2, T0 \
MULXQ 16+I1, T3, T2 \ // T2:T3 = U0*V2
ADCQ T3, T1 \
MULXQ 24+I1, T4, T3 \ // T3:T4 = U0*V3
ADCQ T4, T2 \
\ // Column U1
MOVQ 8+I0, DX \
ADCQ $0, T3 \
MULXQ 0+I1, T4, T5 \ // T5:T4 = U1*V0
MULXQ 8+I1, T7, T6 \ // T6:T7 = U1*V1
ADDQ T7, T5 \
MULXQ 16+I1, T8, T7 \ // T7:T8 = U1*V2
ADCQ T8, T6 \
MULXQ 24+I1, T9, T8 \ // T8:T9 = U1*V3
ADCQ T9, T7 \
ADCQ $0, T8 \
ADDQ T0, T4 \
MOVQ T4, 8+O \ // O[1]
ADCQ T1, T5 \
ADCQ T2, T6 \
ADCQ T3, T7 \
\ // Column U2
MOVQ 16+I0, DX \
ADCQ $0, T8 \
MULXQ 0+I1, T0, T1 \ // T1:T0 = U2*V0
MULXQ 8+I1, T3, T2 \ // T2:T3 = U2*V1
ADDQ T3, T1 \
MULXQ 16+I1, T4, T3 \ // T3:T4 = U2*V2
ADCQ T4, T2 \
MULXQ 24+I1, T9, T4 \ // T4:T9 = U2*V3
ADCQ T9, T3 \
\ // Column U3
MOVQ 24+I0, DX \
ADCQ $0, T4 \
ADDQ T5, T0 \
MOVQ T0, 16+O \ // O[2]
ADCQ T6, T1 \
ADCQ T7, T2 \
ADCQ T8, T3 \
ADCQ $0, T4 \
MULXQ 0+I1, T0, T5 \ // T5:T0 = U3*V0
MULXQ 8+I1, T7, T6 \ // T6:T7 = U3*V1
ADDQ T7, T5 \
MULXQ 16+I1, T8, T7 \ // T7:T8 = U3*V2
ADCQ T8, T6 \
MULXQ 24+I1, T9, T8 \ // T8:T9 = U3*V3
ADCQ T9, T7 \
ADCQ $0, T8 \
\ // Add values in remaining columns
ADDQ T0, T1 \
MOVQ T1, 24+O \ // O[3]
ADCQ T5, T2 \
MOVQ T2, 32+O \ // O[4]
ADCQ T6, T3 \
MOVQ T3, 40+O \ // O[5]
ADCQ T7, T4 \
MOVQ T4, 48+O \ // O[6]
ADCQ $0, T8 \ // O[7]
MOVQ T8, 56+O
// Performs schoolbook multiplication of 2 256-bit numbers. This optimized version
// uses ADOX, ADCX and MULX instructions, interleaving two independent carry
// chains. Macro smashes values in AX and DX.
// Input: I0 and I1 (memory operands, 4 64-bit limbs each).
// Output: O (memory operand, 8 64-bit limbs).
// All the other argument registers are used for storing temporary values.
// Comment notation "Th:Tl = Ux*Vy" means high limb in Th, low limb in Tl.
#define MULS256_MULXADX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
\ // Column U0
MOVQ 0+I0, DX \ // MULX requires the multiplier in DX
\ // T0:T1 = I1*DX
MULXQ I1, T1, T0 \ // T0:T1 = U0*V0 (high:low)
MOVQ T1, O \ // O[0]
MULXQ 8+I1, T2, T1 \ // T1:T2 = U0*V1
XORQ AX, AX \ // clear CF/OF for both carry chains
ADOXQ T2, T0 \
MULXQ 16+I1, T3, T2 \ // T2:T3 = U0*V2
ADOXQ T3, T1 \
MULXQ 24+I1, T4, T3 \ // T3:T4 = U0*V3
ADOXQ T4, T2 \
\ // Column U1
MOVQ 8+I0, DX \
MULXQ I1, T4, T5 \ // T5:T4 = U1*V0
ADOXQ AX, T3 \
XORQ AX, AX \
MULXQ 8+I1, T7, T6 \ // T6:T7 = U1*V1
ADOXQ T0, T4 \
MOVQ T4, 8+O \ // O[1]
ADCXQ T7, T5 \
MULXQ 16+I1, T8, T7 \ // T7:T8 = U1*V2
ADCXQ T8, T6 \
ADOXQ T1, T5 \
MULXQ 24+I1, T9, T8 \ // T8:T9 = U1*V3
ADCXQ T9, T7 \
ADCXQ AX, T8 \
ADOXQ T2, T6 \
\ // Column U2
MOVQ 16+I0, DX \
MULXQ I1, T0, T1 \ // T1:T0 = U2*V0
ADOXQ T3, T7 \
ADOXQ AX, T8 \
XORQ AX, AX \
MULXQ 8+I1, T3, T2 \ // T2:T3 = U2*V1
ADOXQ T5, T0 \
MOVQ T0, 16+O \ // O[2]
ADCXQ T3, T1 \
MULXQ 16+I1, T4, T3 \ // T3:T4 = U2*V2
ADCXQ T4, T2 \
ADOXQ T6, T1 \
MULXQ 24+I1, T9, T4 \ // T4:T9 = U2*V3
ADCXQ T9, T3 \
MOVQ 24+I0, DX \
ADCXQ AX, T4 \
\
ADOXQ T7, T2 \
ADOXQ T8, T3 \
ADOXQ AX, T4 \
\ // Column U3
MULXQ I1, T0, T5 \ // T5:T0 = U3*V0
XORQ AX, AX \
MULXQ 8+I1, T7, T6 \ // T6:T7 = U3*V1
ADCXQ T7, T5 \
ADOXQ T0, T1 \
MULXQ 16+I1, T8, T7 \ // T7:T8 = U3*V2
ADCXQ T8, T6 \
ADOXQ T5, T2 \
MULXQ 24+I1, T9, T8 \ // T8:T9 = U3*V3
ADCXQ T9, T7 \
ADCXQ AX, T8 \
\
ADOXQ T6, T3 \
ADOXQ T7, T4 \
ADOXQ AX, T8 \
MOVQ T1, 24+O \ // O[3]
MOVQ T2, 32+O \ // O[4]
MOVQ T3, 40+O \ // O[5]
MOVQ T4, 48+O \ // O[6] and O[7] below
MOVQ T8, 56+O
// Template of a macro that performs schoolbook multiplication of a 128-bit with
// a 320-bit number. It uses the MULX instruction. This template must be
// customized with functions performing ADD (add1, add2) and ADD-with-carry
// (adc1, adc2). addX/adcX may or may not be instructions that use two
// independent carry chains. Smashes AX and DX.
// Input:
// * I0: 128-bit number (memory operand, 2 limbs)
// * I1: 320-bit number (symbol; limbs read at offsets 24..56 from I1(SB))
// * add1, add2: instruction performing integer addition and starting carry chain
// * adc1, adc2: instruction performing integer addition with carry
// Output: T0-T6 registers (7 limbs, T0 lowest)
#define MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, add1, add2, adc1, adc2) \
\ // Column 0
MOVQ I0, DX \
MULXQ I1+24(SB), T0, T1 \
MULXQ I1+32(SB), T4, T2 \
XORQ AX, AX \
MULXQ I1+40(SB), T5, T3 \
add1 T4, T1 \
adc1 T5, T2 \
MULXQ I1+48(SB), T7, T4 \
adc1 T7, T3 \
MULXQ I1+56(SB), T6, T5 \
adc1 T6, T4 \
adc1 AX, T5 \
\ // Column 1
MOVQ 8+I0, DX \
MULXQ I1+24(SB), T6, T7 \
add2 T6, T1 \
adc2 T7, T2 \
MULXQ I1+32(SB), T8, T6 \
adc2 T6, T3 \
MULXQ I1+40(SB), T7, T9 \
adc2 T9, T4 \
MULXQ I1+48(SB), T9, T6 \
adc2 T6, T5 \
MULXQ I1+56(SB), DX, T6 \
adc2 AX, T6 \
\ // Output
XORQ AX, AX \
add1 T8, T2 \
adc1 T7, T3 \
adc1 T9, T4 \
adc1 DX, T5 \
adc1 AX, T6
// Multiplies a 128-bit with a 320-bit integer. Optimized with the MULX
// instruction only (single ADD/ADC carry chain).
#define MULS_128x320_MULX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADDQ, ADDQ, ADCQ, ADCQ)
// Multiplies a 128-bit with a 320-bit integer. Optimized with MULX, ADOX and
// ADCX instructions (two independent carry chains).
#define MULS_128x320_MULXADX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADOXQ, ADCXQ, ADOXQ, ADCXQ)
// Template of a macro performing multiplication of two 512-bit numbers. It uses one
// level of Karatsuba and one level of schoolbook multiplication. Template must be
// customized with a macro performing 256x256->512 schoolbook multiplication.
// Uses 104 bytes of stack (SP[0-96]) as scratch; saves and restores BP.
// Smashes AX, BX and R8-R15.
// Input:
// * I0, I1 - two 512-bit numbers (8 limbs each)
// * MULS - either MULS256_MULX or MULS256_MULXADX
// Output: OUT - 1024-bit long (16 limbs)
#define MUL(OUT, I0, I1, MULS) \
\ // R[8-11]: U1+U0 (mod 2^256)
XORQ AX, AX \
MOVQ ( 0)(I0), R8 \
MOVQ ( 8)(I0), R9 \
MOVQ (16)(I0), R10 \
MOVQ (24)(I0), R11 \
ADDQ (32)(I0), R8 \
ADCQ (40)(I0), R9 \
ADCQ (48)(I0), R10 \
ADCQ (56)(I0), R11 \
SBBQ $0, AX \ // AX = 0-CF: all-ones iff U1+U0 overflowed
MOVQ R8, ( 0)(SP) \
MOVQ R9, ( 8)(SP) \
MOVQ R10, (16)(SP) \
MOVQ R11, (24)(SP) \
\
\ // R[12-15]: V1+V0 (mod 2^256)
XORQ BX, BX \
MOVQ ( 0)(I1), R12 \
MOVQ ( 8)(I1), R13 \
MOVQ (16)(I1), R14 \
MOVQ (24)(I1), R15 \
ADDQ (32)(I1), R12 \
ADCQ (40)(I1), R13 \
ADCQ (48)(I1), R14 \
ADCQ (56)(I1), R15 \
SBBQ $0, BX \ // BX = 0-CF: all-ones iff V1+V0 overflowed
MOVQ R12, (32)(SP) \
MOVQ R13, (40)(SP) \
MOVQ R14, (48)(SP) \
MOVQ R15, (56)(SP) \
\ // Keep V1+V0 (mod 2^256) only if U1+U0 set the carry flag, otherwise 0
ANDQ AX, R12 \
ANDQ AX, R13 \
ANDQ AX, R14 \
ANDQ AX, R15 \
\ // Keep U1+U0 (mod 2^256) only if V1+V0 set the carry flag, otherwise 0
ANDQ BX, R8 \
ANDQ BX, R9 \
ANDQ BX, R10 \
ANDQ BX, R11 \
\ // res = masked(U0+U1) + masked(V0+V1): high-limb correction of the middle product
ADDQ R12, R8 \
ADCQ R13, R9 \
ADCQ R14, R10 \
ADCQ R15, R11 \
\ // SP[64-96] <- res
MOVQ R8, (64)(SP) \
MOVQ R9, (72)(SP) \
MOVQ R10, (80)(SP) \
MOVQ R11, (88)(SP) \
\ // BP will be used for schoolbook multiplication below; preserve it
MOVQ BP, 96(SP) \
\ // (U1+U0)*(V1+V0)
MULS((64)(OUT), 0(SP), 32(SP), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
\ // U0 x V0
MULS(0(OUT), 0(I0), 0(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
\ // U1 x V1
MULS(0(SP), 32(I0), 32(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
\ // Recover BP
MOVQ 96(SP), BP \
\ // Final part of schoolbook multiplication; R[8-11] = (U0+U1) x (V0+V1) high limbs
MOVQ (64)(SP), R8 \
MOVQ (72)(SP), R9 \
MOVQ (80)(SP), R10 \
MOVQ (88)(SP), R11 \
MOVQ (96)(OUT), AX \
ADDQ AX, R8 \
MOVQ (104)(OUT), AX \
ADCQ AX, R9 \
MOVQ (112)(OUT), AX \
ADCQ AX, R10 \
MOVQ (120)(OUT), AX \
ADCQ AX, R11 \
\ // R[12-15, 8-11] = (U0+U1) x (V0+V1) - U0xV0
MOVQ (64)(OUT), R12 \
MOVQ (72)(OUT), R13 \
MOVQ (80)(OUT), R14 \
MOVQ (88)(OUT), R15 \
SUBQ ( 0)(OUT), R12 \
SBBQ ( 8)(OUT), R13 \
SBBQ (16)(OUT), R14 \
SBBQ (24)(OUT), R15 \
SBBQ (32)(OUT), R8 \
SBBQ (40)(OUT), R9 \
SBBQ (48)(OUT), R10 \
SBBQ (56)(OUT), R11 \
\ // R[12-15, 8-11] <- (U0+U1) x (V0+V1) - U0xV0 - U1xV1
SUBQ ( 0)(SP), R12 \
SBBQ ( 8)(SP), R13 \
SBBQ (16)(SP), R14 \
SBBQ (24)(SP), R15 \
SBBQ (32)(SP), R8 \
SBBQ (40)(SP), R9 \
SBBQ (48)(SP), R10 \
SBBQ (56)(SP), R11 \
\ // Accumulate the middle term into OUT[4-15] (U1xV1 still lives at SP[0-56])
; ADDQ (32)(OUT), R12; MOVQ R12, ( 32)(OUT) \
; ADCQ (40)(OUT), R13; MOVQ R13, ( 40)(OUT) \
; ADCQ (48)(OUT), R14; MOVQ R14, ( 48)(OUT) \
; ADCQ (56)(OUT), R15; MOVQ R15, ( 56)(OUT) \
MOVQ ( 0)(SP), AX; ADCQ AX, R8; MOVQ R8, ( 64)(OUT) \
MOVQ ( 8)(SP), AX; ADCQ AX, R9; MOVQ R9, ( 72)(OUT) \
MOVQ (16)(SP), AX; ADCQ AX, R10; MOVQ R10, ( 80)(OUT) \
MOVQ (24)(SP), AX; ADCQ AX, R11; MOVQ R11, ( 88)(OUT) \
MOVQ (32)(SP), R12; ADCQ $0, R12; MOVQ R12, ( 96)(OUT) \
MOVQ (40)(SP), R13; ADCQ $0, R13; MOVQ R13, (104)(OUT) \
MOVQ (48)(SP), R14; ADCQ $0, R14; MOVQ R14, (112)(OUT) \
MOVQ (56)(SP), R15; ADCQ $0, R15; MOVQ R15, (120)(OUT)
// Template for calculating the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be
// customized with schoolbook multiplication for a 128 x 320-bit number.
// This macro reuses memory of the IN value and *changes* it. Smashes registers
// R[8-15], BX, CX.
// Input:
// * IN: 1024-bit number to be reduced
// * MULS: either MULS_128x320_MULX or MULS_128x320_MULXADX
// Output: OUT 512-bit
#define REDC(OUT, IN, MULS) \
MULS(0(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \ // IN[0-1] * (p503+1)
XORQ R15, R15 \
ADDQ (24)(IN), R8 \
ADCQ (32)(IN), R9 \
ADCQ (40)(IN), R10 \
ADCQ (48)(IN), R11 \
ADCQ (56)(IN), R12 \
ADCQ (64)(IN), R13 \
ADCQ (72)(IN), R14 \
ADCQ (80)(IN), R15 \
MOVQ R8, (24)(IN) \
MOVQ R9, (32)(IN) \
MOVQ R10, (40)(IN) \
MOVQ R11, (48)(IN) \
MOVQ R12, (56)(IN) \
MOVQ R13, (64)(IN) \
MOVQ R14, (72)(IN) \
MOVQ R15, (80)(IN) \
\ // Propagate the final carry through IN[11-15] (MOVQ preserves CF)
MOVQ (88)(IN), R8 \
MOVQ (96)(IN), R9 \
MOVQ (104)(IN), R10 \
MOVQ (112)(IN), R11 \
MOVQ (120)(IN), R12 \
ADCQ $0, R8 \
ADCQ $0, R9 \
ADCQ $0, R10 \
ADCQ $0, R11 \
ADCQ $0, R12 \
MOVQ R8, (88)(IN) \
MOVQ R9, (96)(IN) \
MOVQ R10, (104)(IN) \
MOVQ R11, (112)(IN) \
MOVQ R12, (120)(IN) \
\
MULS(16(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \ // IN[2-3] * (p503+1)
XORQ R15, R15 \
ADDQ (40)(IN), R8 \
ADCQ (48)(IN), R9 \
ADCQ (56)(IN), R10 \
ADCQ (64)(IN), R11 \
ADCQ (72)(IN), R12 \
ADCQ (80)(IN), R13 \
ADCQ (88)(IN), R14 \
ADCQ (96)(IN), R15 \
MOVQ R8, (40)(IN) \
MOVQ R9, (48)(IN) \
MOVQ R10, (56)(IN) \
MOVQ R11, (64)(IN) \
MOVQ R12, (72)(IN) \
MOVQ R13, (80)(IN) \
MOVQ R14, (88)(IN) \
MOVQ R15, (96)(IN) \
\ // Propagate the final carry through IN[13-15]
MOVQ (104)(IN), R8 \
MOVQ (112)(IN), R9 \
MOVQ (120)(IN), R10 \
ADCQ $0, R8 \
ADCQ $0, R9 \
ADCQ $0, R10 \
MOVQ R8, (104)(IN) \
MOVQ R9, (112)(IN) \
MOVQ R10, (120)(IN) \
\
MULS(32(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \ // IN[4-5] * (p503+1)
XORQ R15, R15 \
XORQ BX, BX \
ADDQ ( 56)(IN), R8 \
ADCQ ( 64)(IN), R9 \
ADCQ ( 72)(IN), R10 \
ADCQ ( 80)(IN), R11 \
ADCQ ( 88)(IN), R12 \
ADCQ ( 96)(IN), R13 \
ADCQ (104)(IN), R14 \
ADCQ (112)(IN), R15 \
ADCQ (120)(IN), BX \
MOVQ R8, ( 56)(IN) \
\ // (64)(IN) is not written back: R9 is already the first result limb (OUT[0] below)
MOVQ R10, ( 72)(IN) \
MOVQ R11, ( 80)(IN) \
MOVQ R12, ( 88)(IN) \
MOVQ R13, ( 96)(IN) \
MOVQ R14, (104)(IN) \
MOVQ R15, (112)(IN) \
MOVQ BX, (120)(IN) \
MOVQ R9, ( 0)(OUT) \ // Result: OUT[0]
\
MULS(48(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \ // IN[6-7] * (p503+1)
ADDQ ( 72)(IN), R8 \
ADCQ ( 80)(IN), R9 \
ADCQ ( 88)(IN), R10 \
ADCQ ( 96)(IN), R11 \
ADCQ (104)(IN), R12 \
ADCQ (112)(IN), R13 \
ADCQ (120)(IN), R14 \
MOVQ R8, ( 8)(OUT) \ // Result: OUT[1]
MOVQ R9, (16)(OUT) \ // Result: OUT[2]
MOVQ R10, (24)(OUT) \ // Result: OUT[3]
MOVQ R11, (32)(OUT) \ // Result: OUT[4]
MOVQ R12, (40)(OUT) \ // Result: OUT[5]
MOVQ R13, (48)(OUT) \ // Result: OUT[6] and OUT[7]
MOVQ R14, (56)(OUT)
// func fp503StrongReduce(x *[8]uint64)  (in-place)
// Performs one constant-time conditional subtraction: x <- x - p503, then adds
// p503 back if the subtraction borrowed. Assumes x < 2*p503 on entry so a
// single subtraction suffices — TODO confirm callers guarantee this.
TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8
MOVQ x+0(FP), REG_P1
// Zero AX for later use as the borrow mask:
XORQ AX, AX
// Load p into registers:
MOVQ P503_0, R8
// P503_{1,2} = P503_0, so reuse R8 for limbs 0-2
MOVQ P503_3, R9
MOVQ P503_4, R10
MOVQ P503_5, R11
MOVQ P503_6, R12
MOVQ P503_7, R13
// Set x <- x - p
SUBQ R8, ( 0)(REG_P1)
SBBQ R8, ( 8)(REG_P1)
SBBQ R8, (16)(REG_P1)
SBBQ R9, (24)(REG_P1)
SBBQ R10, (32)(REG_P1)
SBBQ R11, (40)(REG_P1)
SBBQ R12, (48)(REG_P1)
SBBQ R13, (56)(REG_P1)
// Save carry flag indicating x-p < 0 as a mask (AX = 0 or all-ones)
SBBQ $0, AX
// Conditionally add p back to x if x-p < 0 (mask out p otherwise)
ANDQ AX, R8
ANDQ AX, R9
ANDQ AX, R10
ANDQ AX, R11
ANDQ AX, R12
ANDQ AX, R13
ADDQ R8, ( 0)(REG_P1)
ADCQ R8, ( 8)(REG_P1)
ADCQ R8, (16)(REG_P1)
ADCQ R9, (24)(REG_P1)
ADCQ R10,(32)(REG_P1)
ADCQ R11,(40)(REG_P1)
ADCQ R12,(48)(REG_P1)
ADCQ R13,(56)(REG_P1)
RET
// func fp503ConditionalSwap(x, y *[8]uint64, choice uint8)
// Constant-time swap: exchanges the contents of x and y iff choice == 1,
// leaves both untouched iff choice == 0, using an XOR-mask so that the memory
// access pattern is independent of choice.
TEXT ·fp503ConditionalSwap(SB),NOSPLIT,$0-17
MOVQ x+0(FP), REG_P1
MOVQ y+8(FP), REG_P2
MOVB choice+16(FP), AL // AL = 0 or 1
MOVBLZX AL, AX // AX = 0 or 1
NEGQ AX // AX = 0x00..00 or 0xff..ff
#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
MOVQ (idx*8)(REG_P1), BX \ // BX = x[idx]
MOVQ (idx*8)(REG_P2), CX \ // CX = y[idx]
MOVQ CX, DX \ // DX = y[idx]
XORQ BX, DX \ // DX = y[idx] ^ x[idx]
ANDQ AX, DX \ // DX = (y[idx] ^ x[idx]) & mask
XORQ DX, BX \ // BX = ((y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx]
XORQ DX, CX \ // CX = ((y[idx] ^ x[idx]) & mask) ^ y[idx] = y[idx] or x[idx]
MOVQ BX, (idx*8)(REG_P1) \
MOVQ CX, (idx*8)(REG_P2)
#endif
CSWAP_BLOCK(0)
CSWAP_BLOCK(1)
CSWAP_BLOCK(2)
CSWAP_BLOCK(3)
CSWAP_BLOCK(4)
CSWAP_BLOCK(5)
CSWAP_BLOCK(6)
CSWAP_BLOCK(7)
#ifdef CSWAP_BLOCK
#undef CSWAP_BLOCK
#endif
RET
// func fp503AddReduced(z, x, y *[8]uint64)
// z = x + y, reduced modulo 2*p503 with one constant-time conditional
// subtraction (subtract 2*p503, add it back if that borrowed).
TEXT ·fp503AddReduced(SB),NOSPLIT,$0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// Used later to calculate a mask
XORQ CX, CX
// [R8-R15]: z = x + y
MOVQ ( 0)(REG_P1), R8
MOVQ ( 8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
ADDQ ( 0)(REG_P2), R8
ADCQ ( 8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
ADCQ (48)(REG_P2), R14
ADCQ (56)(REG_P2), R15
// z = z - 2*p503 (P503X2_2 == P503X2_1, so AX is reused for limb 2)
MOVQ P503X2_0, AX
SUBQ AX, R8
MOVQ P503X2_1, AX
SBBQ AX, R9
SBBQ AX, R10
MOVQ P503X2_3, AX
SBBQ AX, R11
MOVQ P503X2_4, AX
SBBQ AX, R12
MOVQ P503X2_5, AX
SBBQ AX, R13
MOVQ P503X2_6, AX
SBBQ AX, R14
MOVQ P503X2_7, AX
SBBQ AX, R15
// mask: CX = all-ones iff the subtraction borrowed
SBBQ $0, CX
// move z to REG_P3
MOVQ R8, ( 0)(REG_P3)
MOVQ R9, ( 8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
// if z<0 add p503x2 back (R9 doubles for limbs 1 and 2, since P503X2_2 == P503X2_1)
MOVQ P503X2_0, R8
MOVQ P503X2_1, R9
MOVQ P503X2_3, R10
MOVQ P503X2_4, R11
MOVQ P503X2_5, R12
MOVQ P503X2_6, R13
MOVQ P503X2_7, R14
ANDQ CX, R8
ANDQ CX, R9
ANDQ CX, R10
ANDQ CX, R11
ANDQ CX, R12
ANDQ CX, R13
ANDQ CX, R14
MOVQ ( 0)(REG_P3), AX; ADDQ R8, AX; MOVQ AX, ( 0)(REG_P3)
MOVQ ( 8)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, ( 8)(REG_P3)
MOVQ (16)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (16)(REG_P3)
MOVQ (24)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (24)(REG_P3)
MOVQ (32)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (32)(REG_P3)
MOVQ (40)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (40)(REG_P3)
MOVQ (48)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (48)(REG_P3)
MOVQ (56)(REG_P3), AX; ADCQ R14, AX; MOVQ AX, (56)(REG_P3)
RET
// func fp503SubReduced(z, x, y *[8]uint64)
// z = x - y, with 2*p503 added back in constant time if the subtraction
// borrowed (keeps the result in the range used by the reduced representation).
TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// Used later to calculate a mask
XORQ CX, CX
MOVQ ( 0)(REG_P1), R8
MOVQ ( 8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
SUBQ ( 0)(REG_P2), R8
SBBQ ( 8)(REG_P2), R9
SBBQ (16)(REG_P2), R10
SBBQ (24)(REG_P2), R11
SBBQ (32)(REG_P2), R12
SBBQ (40)(REG_P2), R13
SBBQ (48)(REG_P2), R14
SBBQ (56)(REG_P2), R15
// mask: CX = all-ones iff x - y borrowed
SBBQ $0, CX
// store x-y in REG_P3
MOVQ R8, ( 0)(REG_P3)
MOVQ R9, ( 8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
// if z<0 add p503x2 back (R9 doubles for limbs 1 and 2, since P503X2_2 == P503X2_1)
MOVQ P503X2_0, R8
MOVQ P503X2_1, R9
MOVQ P503X2_3, R10
MOVQ P503X2_4, R11
MOVQ P503X2_5, R12
MOVQ P503X2_6, R13
MOVQ P503X2_7, R14
ANDQ CX, R8
ANDQ CX, R9
ANDQ CX, R10
ANDQ CX, R11
ANDQ CX, R12
ANDQ CX, R13
ANDQ CX, R14
MOVQ ( 0)(REG_P3), AX; ADDQ R8, AX; MOVQ AX, ( 0)(REG_P3)
MOVQ ( 8)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, ( 8)(REG_P3)
MOVQ (16)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (16)(REG_P3)
MOVQ (24)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (24)(REG_P3)
MOVQ (32)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (32)(REG_P3)
MOVQ (40)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (40)(REG_P3)
MOVQ (48)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (48)(REG_P3)
MOVQ (56)(REG_P3), AX; ADCQ R14, AX; MOVQ AX, (56)(REG_P3)
RET
// func fp503Mul(z *[16]uint64, x, y *[8]uint64)
// z = x * y (1024-bit product, no reduction). Dispatches at runtime:
// ADX+MULX path, MULX path, or the generic Karatsuba/comba path below.
TEXT ·fp503Mul(SB), NOSPLIT, $104-24
MOVQ z+ 0(FP), CX
MOVQ x+ 8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// Check whether to use an optimized implementation
CMPB ·useADXMULX(SB), $1
JE mul_with_mulx_adx
CMPB ·useMULX(SB), $1
JE mul_with_mulx
// Generic x86 implementation (below) uses a variant of the Karatsuba method.
//
// Here we store the destination in CX instead of in REG_P3 because the
// multiplication instructions use DX as an implicit destination
// operand: MULQ $REG sets DX:AX <-- AX * $REG.
// RAX and RDX will be used for a mask (0-borrow)
XORQ AX, AX
// CX[0-3]: U1+U0 (mod 2^256)
MOVQ (32)(REG_P1), R8
MOVQ (40)(REG_P1), R9
MOVQ (48)(REG_P1), R10
MOVQ (56)(REG_P1), R11
ADDQ ( 0)(REG_P1), R8
ADCQ ( 8)(REG_P1), R9
ADCQ (16)(REG_P1), R10
ADCQ (24)(REG_P1), R11
MOVQ R8, ( 0)(CX)
MOVQ R9, ( 8)(CX)
MOVQ R10, (16)(CX)
MOVQ R11, (24)(CX)
SBBQ $0, AX // AX = all-ones iff U1+U0 overflowed
// R12-R15: V1+V0 (mod 2^256)
XORQ DX, DX
MOVQ (32)(REG_P2), R12
MOVQ (40)(REG_P2), R13
MOVQ (48)(REG_P2), R14
MOVQ (56)(REG_P2), R15
ADDQ ( 0)(REG_P2), R12
ADCQ ( 8)(REG_P2), R13
ADCQ (16)(REG_P2), R14
ADCQ (24)(REG_P2), R15
SBBQ $0, DX // DX = all-ones iff V1+V0 overflowed
// Store carry masks on stack
MOVQ AX, (64)(SP)
MOVQ DX, (72)(SP)
// (SP[0-3],R8,R9,R10,R11) <- (U0+U1)*(V0+V1).
// MUL using comba; in the comments below U=U0+U1, V=V0+V1
// U0*V0
MOVQ (CX), AX
MULQ R12
MOVQ AX, (SP) // C0
MOVQ DX, R8
// U0*V1
XORQ R9, R9
MOVQ (CX), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
// U1*V0
XORQ R10, R10
MOVQ (8)(CX), AX
MULQ R12
ADDQ AX, R8
MOVQ R8, (8)(SP) // C1
ADCQ DX, R9
ADCQ $0, R10
// U0*V2
XORQ R8, R8
MOVQ (CX), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
// U2*V0
MOVQ (16)(CX), AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
// U1*V1
MOVQ (8)(CX), AX
MULQ R13
ADDQ AX, R9
MOVQ R9, (16)(SP) // C2
ADCQ DX, R10
ADCQ $0, R8
// U0*V3
XORQ R9, R9
MOVQ (CX), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U3*V0
MOVQ (24)(CX), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U1*V2
MOVQ (8)(CX), AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U2*V1
MOVQ (16)(CX), AX
MULQ R13
ADDQ AX, R10
MOVQ R10, (24)(SP) // C3
ADCQ DX, R8
ADCQ $0, R9
// U1*V3
XORQ R10, R10
MOVQ (8)(CX), AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
// U3*V1
MOVQ (24)(CX), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
// U2*V2
MOVQ (16)(CX), AX
MULQ R14
ADDQ AX, R8
MOVQ R8, (32)(SP) // C4
ADCQ DX, R9
ADCQ $0, R10
// U2*V3
XORQ R11, R11
MOVQ (16)(CX), AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R11
// U3*V2
MOVQ (24)(CX), AX
MULQ R14
ADDQ AX, R9 // C5
ADCQ DX, R10
ADCQ $0, R11
// U3*V3
MOVQ (24)(CX), AX
MULQ R15
ADDQ AX, R10 // C6
ADCQ DX, R11 // C7
// High-limb Karatsuba correction: add masked V-sum (mask = U carry) to C4-C7
MOVQ (64)(SP), AX
ANDQ AX, R12
ANDQ AX, R13
ANDQ AX, R14
ANDQ AX, R15
ADDQ R8, R12
ADCQ R9, R13
ADCQ R10, R14
ADCQ R11, R15
// ...and the masked U-sum (mask = V carry); store corrected C4-C7 at SP[32-56]
MOVQ (72)(SP), AX
MOVQ (CX), R8
MOVQ (8)(CX), R9
MOVQ (16)(CX), R10
MOVQ (24)(CX), R11
ANDQ AX, R8
ANDQ AX, R9
ANDQ AX, R10
ANDQ AX, R11
ADDQ R12, R8
ADCQ R13, R9
ADCQ R14, R10
ADCQ R15, R11
MOVQ R8, (32)(SP)
MOVQ R9, (40)(SP)
MOVQ R10, (48)(SP)
MOVQ R11, (56)(SP)
// CX[0-7] <- U0*V0 (low halves of x and y)
// U0*V0
MOVQ (REG_P1), R11
MOVQ (REG_P2), AX
MULQ R11
XORQ R9, R9
MOVQ AX, (CX) // C0
MOVQ DX, R8
// U0*V1
MOVQ (16)(REG_P1), R14
MOVQ (8)(REG_P2), AX
MULQ R11
XORQ R10, R10
ADDQ AX, R8
ADCQ DX, R9
// U1*V0
MOVQ (8)(REG_P1), R12
MOVQ (REG_P2), AX
MULQ R12
ADDQ AX, R8
MOVQ R8, (8)(CX) // C1
ADCQ DX, R9
ADCQ $0, R10
// U0*V2
XORQ R8, R8
MOVQ (16)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
// U2*V0
MOVQ (REG_P2), R13
MOVQ R14, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
// U1*V1
MOVQ (8)(REG_P2), AX
MULQ R12
ADDQ AX, R9
MOVQ R9, (16)(CX) // C2
ADCQ DX, R10
ADCQ $0, R8
// U0*V3
XORQ R9, R9
MOVQ (24)(REG_P2), AX
MULQ R11
MOVQ (24)(REG_P1), R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U3*V0
MOVQ R15, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U1*V2
MOVQ (16)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U2*V1
MOVQ (8)(REG_P2), AX
MULQ R14
ADDQ AX, R10
MOVQ R10, (24)(CX) // C3
ADCQ DX, R8
ADCQ $0, R9
// U1*V3
XORQ R10, R10
MOVQ (24)(REG_P2), AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
// U3*V1
MOVQ (8)(REG_P2), AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
// U2*V2
MOVQ (16)(REG_P2), AX
MULQ R14
ADDQ AX, R8
MOVQ R8, (32)(CX) // C4
ADCQ DX, R9
ADCQ $0, R10
// U2*V3
XORQ R8, R8
MOVQ (24)(REG_P2), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
// U3*V2
MOVQ (16)(REG_P2), AX
MULQ R15
ADDQ AX, R9
MOVQ R9, (40)(CX) // C5
ADCQ DX, R10
ADCQ $0, R8
// U3*V3
MOVQ (24)(REG_P2), AX
MULQ R15
ADDQ AX, R10
MOVQ R10, (48)(CX) // C6
ADCQ DX, R8
MOVQ R8, (56)(CX) // C7
// CX[8-15] <- U1*V1 (high halves of x and y); same comba schedule as above
MOVQ (32)(REG_P1), R11
MOVQ (32)(REG_P2), AX
MULQ R11
XORQ R9, R9
MOVQ AX, (64)(CX) // C0
MOVQ DX, R8
MOVQ (48)(REG_P1), R14
MOVQ (40)(REG_P2), AX
MULQ R11
XORQ R10, R10
ADDQ AX, R8
ADCQ DX, R9
MOVQ (40)(REG_P1), R12
MOVQ (32)(REG_P2), AX
MULQ R12
ADDQ AX, R8
MOVQ R8, (72)(CX) // C1
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (48)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (32)(REG_P2), R13
MOVQ R14, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (40)(REG_P2), AX
MULQ R12
ADDQ AX, R9
MOVQ R9, (80)(CX) // C2
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (56)(REG_P2), AX
MULQ R11
MOVQ (56)(REG_P1), R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ R15, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (48)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (40)(REG_P2), AX
MULQ R14
ADDQ AX, R10
MOVQ R10, (88)(CX) // C3
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (56)(REG_P2), AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (40)(REG_P2), AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (48)(REG_P2), AX
MULQ R14
ADDQ AX, R8
MOVQ R8, (96)(CX) // C4
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (56)(REG_P2), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (48)(REG_P2), AX
MULQ R15
ADDQ AX, R9
MOVQ R9, (104)(CX) // C5
ADCQ DX, R10
ADCQ $0, R8
MOVQ (56)(REG_P2), AX
MULQ R15
ADDQ AX, R10
MOVQ R10, (112)(CX) // C6
ADCQ DX, R8
MOVQ R8, (120)(CX) // C7
// [R8-R15] <- (U0+U1)*(V0+V1) - U0*V0
MOVQ (SP), R8
SUBQ (CX), R8
MOVQ (8)(SP), R9
SBBQ (8)(CX), R9
MOVQ (16)(SP), R10
SBBQ (16)(CX), R10
MOVQ (24)(SP), R11
SBBQ (24)(CX), R11
MOVQ (32)(SP), R12
SBBQ (32)(CX), R12
MOVQ (40)(SP), R13
SBBQ (40)(CX), R13
MOVQ (48)(SP), R14
SBBQ (48)(CX), R14
MOVQ (56)(SP), R15
SBBQ (56)(CX), R15
// [R8-R15] <- (U0+U1)*(V0+V1) - U0*V0 - U1*V1
MOVQ ( 64)(CX), AX; SUBQ AX, R8
MOVQ ( 72)(CX), AX; SBBQ AX, R9
MOVQ ( 80)(CX), AX; SBBQ AX, R10
MOVQ ( 88)(CX), AX; SBBQ AX, R11
MOVQ ( 96)(CX), AX; SBBQ AX, R12
MOVQ (104)(CX), DX; SBBQ DX, R13
MOVQ (112)(CX), DI; SBBQ DI, R14
MOVQ (120)(CX), SI; SBBQ SI, R15
// Final result: accumulate the middle term into CX[4-15]
ADDQ (32)(CX), R8; MOVQ R8, (32)(CX)
ADCQ (40)(CX), R9; MOVQ R9, (40)(CX)
ADCQ (48)(CX), R10; MOVQ R10, (48)(CX)
ADCQ (56)(CX), R11; MOVQ R11, (56)(CX)
ADCQ (64)(CX), R12; MOVQ R12, (64)(CX)
ADCQ (72)(CX), R13; MOVQ R13, (72)(CX)
ADCQ (80)(CX), R14; MOVQ R14, (80)(CX)
ADCQ (88)(CX), R15; MOVQ R15, (88)(CX)
ADCQ $0, AX; MOVQ AX, (96)(CX)
ADCQ $0, DX; MOVQ DX, (104)(CX)
ADCQ $0, DI; MOVQ DI, (112)(CX)
ADCQ $0, SI; MOVQ SI, (120)(CX)
RET
mul_with_mulx_adx:
// Mul implementation for CPUs supporting two independent carry chains
// (ADOX/ADCX) and the carry-less MULX multiplier
MUL(CX, REG_P1, REG_P2, MULS256_MULXADX)
RET
mul_with_mulx:
// Mul implementation for CPUs supporting the carry-less MULX multiplier
MUL(CX, REG_P1, REG_P2, MULS256_MULX)
RET
  1082. TEXT ·fp503MontgomeryReduce(SB), $0-16
  1083. MOVQ z+0(FP), REG_P2
  1084. MOVQ x+8(FP), REG_P1
  1085. // Check wether to use optimized implementation
  1086. CMPB ·useADXMULX(SB), $1
  1087. JE redc_with_mulx_adx
  1088. CMPB ·useMULX(SB), $1
  1089. JE redc_with_mulx
  1090. MOVQ (REG_P1), R11
  1091. MOVQ P503P1_3, AX
  1092. MULQ R11
  1093. XORQ R8, R8
  1094. ADDQ (24)(REG_P1), AX
  1095. MOVQ AX, (24)(REG_P2)
  1096. ADCQ DX, R8
  1097. XORQ R9, R9
  1098. MOVQ P503P1_4, AX
  1099. MULQ R11
  1100. XORQ R10, R10
  1101. ADDQ AX, R8
  1102. ADCQ DX, R9
  1103. MOVQ (8)(REG_P1), R12
  1104. MOVQ P503P1_3, AX
  1105. MULQ R12
  1106. ADDQ AX, R8
  1107. ADCQ DX, R9
  1108. ADCQ $0, R10
  1109. ADDQ (32)(REG_P1), R8
  1110. MOVQ R8, (32)(REG_P2) // Z4
  1111. ADCQ $0, R9
  1112. ADCQ $0, R10
  1113. XORQ R8, R8
  1114. MOVQ P503P1_5, AX
  1115. MULQ R11
  1116. ADDQ AX, R9
  1117. ADCQ DX, R10
  1118. ADCQ $0, R8
  1119. MOVQ P503P1_4, AX
  1120. MULQ R12
  1121. ADDQ AX, R9
  1122. ADCQ DX, R10
  1123. ADCQ $0, R8
  1124. MOVQ (16)(REG_P1), R13
  1125. MOVQ P503P1_3, AX
  1126. MULQ R13
  1127. ADDQ AX, R9
  1128. ADCQ DX, R10
  1129. ADCQ $0, R8
  1130. ADDQ (40)(REG_P1), R9
  1131. MOVQ R9, (40)(REG_P2) // Z5
  1132. ADCQ $0, R10
  1133. ADCQ $0, R8
  1134. XORQ R9, R9
  1135. MOVQ P503P1_6, AX
  1136. MULQ R11
  1137. ADDQ AX, R10
  1138. ADCQ DX, R8
  1139. ADCQ $0, R9
  1140. MOVQ P503P1_5, AX
  1141. MULQ R12
  1142. ADDQ AX, R10
  1143. ADCQ DX, R8
  1144. ADCQ $0, R9
  1145. MOVQ P503P1_4, AX
  1146. MULQ R13
  1147. ADDQ AX, R10
  1148. ADCQ DX, R8
  1149. ADCQ $0, R9
  1150. MOVQ (24)(REG_P2), R14
  1151. MOVQ P503P1_3, AX
  1152. MULQ R14
  1153. ADDQ AX, R10
  1154. ADCQ DX, R8
  1155. ADCQ $0, R9
  1156. ADDQ (48)(REG_P1), R10
  1157. MOVQ R10, (48)(REG_P2) // Z6
  1158. ADCQ $0, R8
  1159. ADCQ $0, R9
  1160. XORQ R10, R10
  1161. MOVQ P503P1_7, AX
  1162. MULQ R11
  1163. ADDQ AX, R8
  1164. ADCQ DX, R9
  1165. ADCQ $0, R10
  1166. MOVQ P503P1_6, AX
  1167. MULQ R12
  1168. ADDQ AX, R8
  1169. ADCQ DX, R9
  1170. ADCQ $0, R10
  1171. MOVQ P503P1_5, AX
  1172. MULQ R13
  1173. ADDQ AX, R8
  1174. ADCQ DX, R9
  1175. ADCQ $0, R10
  1176. MOVQ P503P1_4, AX
  1177. MULQ R14
  1178. ADDQ AX, R8
  1179. ADCQ DX, R9
  1180. ADCQ $0, R10
  1181. MOVQ (32)(REG_P2), R15
  1182. MOVQ P503P1_3, AX
  1183. MULQ R15
  1184. ADDQ AX, R8
  1185. ADCQ DX, R9
  1186. ADCQ $0, R10
  1187. ADDQ (56)(REG_P1), R8
  1188. MOVQ R8, (56)(REG_P2) // Z7
  1189. ADCQ $0, R9
  1190. ADCQ $0, R10
  1191. XORQ R8, R8
  1192. MOVQ P503P1_7, AX
  1193. MULQ R12
  1194. ADDQ AX, R9
  1195. ADCQ DX, R10
  1196. ADCQ $0, R8
  1197. MOVQ P503P1_6, AX
  1198. MULQ R13
  1199. ADDQ AX, R9
  1200. ADCQ DX, R10
  1201. ADCQ $0, R8
  1202. MOVQ P503P1_5, AX
  1203. MULQ R14
  1204. ADDQ AX, R9
  1205. ADCQ DX, R10
  1206. ADCQ $0, R8
  1207. MOVQ P503P1_4, AX
  1208. MULQ R15
  1209. ADDQ AX, R9
  1210. ADCQ DX, R10
  1211. ADCQ $0, R8
  1212. MOVQ (40)(REG_P2), CX
  1213. MOVQ P503P1_3, AX
  1214. MULQ CX
  1215. ADDQ AX, R9
  1216. ADCQ DX, R10
  1217. ADCQ $0, R8
  1218. ADDQ (64)(REG_P1), R9
  1219. MOVQ R9, (REG_P2) // Z0
  1220. ADCQ $0, R10
  1221. ADCQ $0, R8
  1222. XORQ R9, R9
  1223. MOVQ P503P1_7, AX
  1224. MULQ R13
  1225. ADDQ AX, R10
  1226. ADCQ DX, R8
  1227. ADCQ $0, R9
  1228. MOVQ P503P1_6, AX
  1229. MULQ R14
  1230. ADDQ AX, R10
  1231. ADCQ DX, R8
  1232. ADCQ $0, R9
  1233. MOVQ P503P1_5, AX
  1234. MULQ R15
  1235. ADDQ AX, R10
  1236. ADCQ DX, R8
  1237. ADCQ $0, R9
  1238. MOVQ P503P1_4, AX
  1239. MULQ CX
  1240. ADDQ AX, R10
  1241. ADCQ DX, R8
  1242. ADCQ $0, R9
  1243. MOVQ (48)(REG_P2), R13
  1244. MOVQ P503P1_3, AX
  1245. MULQ R13
  1246. ADDQ AX, R10
  1247. ADCQ DX, R8
  1248. ADCQ $0, R9
  1249. ADDQ (72)(REG_P1), R10
  1250. MOVQ R10, (8)(REG_P2) // Z1
  1251. ADCQ $0, R8
  1252. ADCQ $0, R9
  1253. XORQ R10, R10
  1254. MOVQ P503P1_7, AX
  1255. MULQ R14
  1256. ADDQ AX, R8
  1257. ADCQ DX, R9
  1258. ADCQ $0, R10
  1259. MOVQ P503P1_6, AX
  1260. MULQ R15
  1261. ADDQ AX, R8
  1262. ADCQ DX, R9
  1263. ADCQ $0, R10
  1264. MOVQ P503P1_5, AX
  1265. MULQ CX
  1266. ADDQ AX, R8
  1267. ADCQ DX, R9
  1268. ADCQ $0, R10
  1269. MOVQ P503P1_4, AX
  1270. MULQ R13
  1271. ADDQ AX, R8
  1272. ADCQ DX, R9
  1273. ADCQ $0, R10
  1274. MOVQ (56)(REG_P2), R14
  1275. MOVQ P503P1_3, AX
  1276. MULQ R14
  1277. ADDQ AX, R8
  1278. ADCQ DX, R9
  1279. ADCQ $0, R10
  1280. ADDQ (80)(REG_P1), R8
  1281. MOVQ R8, (16)(REG_P2) // Z2
  1282. ADCQ $0, R9
  1283. ADCQ $0, R10
  1284. XORQ R8, R8
  1285. MOVQ P503P1_7, AX
  1286. MULQ R15
  1287. ADDQ AX, R9
  1288. ADCQ DX, R10
  1289. ADCQ $0, R8
  1290. MOVQ P503P1_6, AX
  1291. MULQ CX
  1292. ADDQ AX, R9
  1293. ADCQ DX, R10
  1294. ADCQ $0, R8
  1295. MOVQ P503P1_5, AX
  1296. MULQ R13
  1297. ADDQ AX, R9
  1298. ADCQ DX, R10
  1299. ADCQ $0, R8
  1300. MOVQ P503P1_4, AX
  1301. MULQ R14
  1302. ADDQ AX, R9
  1303. ADCQ DX, R10
  1304. ADCQ $0, R8
  1305. ADDQ (88)(REG_P1), R9
  1306. MOVQ R9, (24)(REG_P2) // Z3
  1307. ADCQ $0, R10
  1308. ADCQ $0, R8
  1309. XORQ R9, R9
  1310. MOVQ P503P1_7, AX
  1311. MULQ CX
  1312. ADDQ AX, R10
  1313. ADCQ DX, R8
  1314. ADCQ $0, R9
  1315. MOVQ P503P1_6, AX
  1316. MULQ R13
  1317. ADDQ AX, R10
  1318. ADCQ DX, R8
  1319. ADCQ $0, R9
  1320. MOVQ P503P1_5, AX
  1321. MULQ R14
  1322. ADDQ AX, R10
  1323. ADCQ DX, R8
  1324. ADCQ $0, R9
  1325. ADDQ (96)(REG_P1), R10
  1326. MOVQ R10, (32)(REG_P2) // Z4
  1327. ADCQ $0, R8
  1328. ADCQ $0, R9
  1329. XORQ R10, R10
  1330. MOVQ P503P1_7, AX
  1331. MULQ R13
  1332. ADDQ AX, R8
  1333. ADCQ DX, R9
  1334. ADCQ $0, R10
  1335. MOVQ P503P1_6, AX
  1336. MULQ R14
  1337. ADDQ AX, R8
  1338. ADCQ DX, R9
  1339. ADCQ $0, R10
  1340. ADDQ (104)(REG_P1), R8 // Z5
  1341. MOVQ R8, (40)(REG_P2) // Z5
  1342. ADCQ $0, R9
  1343. ADCQ $0, R10
  1344. MOVQ P503P1_7, AX
  1345. MULQ R14
  1346. ADDQ AX, R9
  1347. ADCQ DX, R10
  1348. ADDQ (112)(REG_P1), R9 // Z6
  1349. MOVQ R9, (48)(REG_P2) // Z6
  1350. ADCQ $0, R10
  1351. ADDQ (120)(REG_P1), R10 // Z7
  1352. MOVQ R10, (56)(REG_P2) // Z7
  1353. RET
  1354. redc_with_mulx_adx:
  1355. // Implementation of the Montgomery reduction for CPUs
  1356. // supporting two independent carry chain (ADOX/ADCX)
  1357. // instructions and carry-less MULX multiplier
  1358. REDC(REG_P2, REG_P1, MULS_128x320_MULXADX)
  1359. RET
  1360. redc_with_mulx:
  1361. // Implementation of the Montgomery reduction for CPUs
  1362. // supporting carry-less MULX multiplier.
  1363. REDC(REG_P2, REG_P1, MULS_128x320_MULX)
  1364. RET
  1365. TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24
  1366. MOVQ z+0(FP), REG_P3
  1367. MOVQ x+8(FP), REG_P1
  1368. MOVQ y+16(FP), REG_P2
  1369. MOVQ (REG_P1), R8
  1370. MOVQ (8)(REG_P1), R9
  1371. MOVQ (16)(REG_P1), R10
  1372. MOVQ (24)(REG_P1), R11
  1373. MOVQ (32)(REG_P1), R12
  1374. MOVQ (40)(REG_P1), R13
  1375. MOVQ (48)(REG_P1), R14
  1376. MOVQ (56)(REG_P1), R15
  1377. ADDQ (REG_P2), R8
  1378. ADCQ (8)(REG_P2), R9
  1379. ADCQ (16)(REG_P2), R10
  1380. ADCQ (24)(REG_P2), R11
  1381. ADCQ (32)(REG_P2), R12
  1382. ADCQ (40)(REG_P2), R13
  1383. ADCQ (48)(REG_P2), R14
  1384. ADCQ (56)(REG_P2), R15
  1385. MOVQ R8, (REG_P3)
  1386. MOVQ R9, (8)(REG_P3)
  1387. MOVQ R10, (16)(REG_P3)
  1388. MOVQ R11, (24)(REG_P3)
  1389. MOVQ R12, (32)(REG_P3)
  1390. MOVQ R13, (40)(REG_P3)
  1391. MOVQ R14, (48)(REG_P3)
  1392. MOVQ R15, (56)(REG_P3)
  1393. RET
  1394. TEXT ·fp503X2AddLazy(SB), NOSPLIT, $0-24
  1395. MOVQ z+0(FP), REG_P3
  1396. MOVQ x+8(FP), REG_P1
  1397. MOVQ y+16(FP), REG_P2
  1398. MOVQ (REG_P1), R8
  1399. MOVQ (8)(REG_P1), R9
  1400. MOVQ (16)(REG_P1), R10
  1401. MOVQ (24)(REG_P1), R11
  1402. MOVQ (32)(REG_P1), R12
  1403. MOVQ (40)(REG_P1), R13
  1404. MOVQ (48)(REG_P1), R14
  1405. MOVQ (56)(REG_P1), R15
  1406. MOVQ (64)(REG_P1), AX
  1407. MOVQ (72)(REG_P1), BX
  1408. MOVQ (80)(REG_P1), CX
  1409. ADDQ (REG_P2), R8
  1410. ADCQ (8)(REG_P2), R9
  1411. ADCQ (16)(REG_P2), R10
  1412. ADCQ (24)(REG_P2), R11
  1413. ADCQ (32)(REG_P2), R12
  1414. ADCQ (40)(REG_P2), R13
  1415. ADCQ (48)(REG_P2), R14
  1416. ADCQ (56)(REG_P2), R15
  1417. ADCQ (64)(REG_P2), AX
  1418. ADCQ (72)(REG_P2), BX
  1419. ADCQ (80)(REG_P2), CX
  1420. MOVQ R8, (REG_P3)
  1421. MOVQ R9, (8)(REG_P3)
  1422. MOVQ R10, (16)(REG_P3)
  1423. MOVQ R11, (24)(REG_P3)
  1424. MOVQ R12, (32)(REG_P3)
  1425. MOVQ R13, (40)(REG_P3)
  1426. MOVQ R14, (48)(REG_P3)
  1427. MOVQ R15, (56)(REG_P3)
  1428. MOVQ AX, (64)(REG_P3)
  1429. MOVQ BX, (72)(REG_P3)
  1430. MOVQ CX, (80)(REG_P3)
  1431. MOVQ (88)(REG_P1), R8
  1432. MOVQ (96)(REG_P1), R9
  1433. MOVQ (104)(REG_P1), R10
  1434. MOVQ (112)(REG_P1), R11
  1435. MOVQ (120)(REG_P1), R12
  1436. ADCQ (88)(REG_P2), R8
  1437. ADCQ (96)(REG_P2), R9
  1438. ADCQ (104)(REG_P2), R10
  1439. ADCQ (112)(REG_P2), R11
  1440. ADCQ (120)(REG_P2), R12
  1441. MOVQ R8, (88)(REG_P3)
  1442. MOVQ R9, (96)(REG_P3)
  1443. MOVQ R10, (104)(REG_P3)
  1444. MOVQ R11, (112)(REG_P3)
  1445. MOVQ R12, (120)(REG_P3)
  1446. RET
// fp503X2SubLazy: z = x - y over 16 limbs (double-width, 2x512-bit
// representation); if the subtraction borrows (x < y), the result is
// corrected by adding p503 * 2^512, i.e. p503 aligned to the upper
// eight limbs.
// In: z+0(FP), x+8(FP), y+16(FP) — pointers to 16x64-bit operands.
// The correction is branch-free: SBBQ $0, CX turns the final borrow
// into an all-ones mask (or leaves CX zero), which gates the p503
// limbs via ANDQ before the masked add.
TEXT ·fp503X2SubLazy(SB), NOSPLIT, $0-24
	MOVQ z+0(FP), REG_P3
	MOVQ x+8(FP), REG_P1
	MOVQ y+16(FP), REG_P2

	// Used later to store result of 0-borrow (becomes the mask).
	XORQ CX, CX

	// SUBC for first 10 limbs: x[0..9] - y[0..9].
	// (Split 10+6 only because of register pressure; the borrow
	// chain continues uninterrupted — MOVQ preserves EFLAGS.)
	MOVQ (REG_P1), R8
	MOVQ (8)(REG_P1), R9
	MOVQ (16)(REG_P1), R10
	MOVQ (24)(REG_P1), R11
	MOVQ (32)(REG_P1), R12
	MOVQ (40)(REG_P1), R13
	MOVQ (48)(REG_P1), R14
	MOVQ (56)(REG_P1), R15
	MOVQ (64)(REG_P1), AX
	MOVQ (72)(REG_P1), BX
	SUBQ (REG_P2), R8
	SBBQ (8)(REG_P2), R9
	SBBQ (16)(REG_P2), R10
	SBBQ (24)(REG_P2), R11
	SBBQ (32)(REG_P2), R12
	SBBQ (40)(REG_P2), R13
	SBBQ (48)(REG_P2), R14
	SBBQ (56)(REG_P2), R15
	SBBQ (64)(REG_P2), AX
	SBBQ (72)(REG_P2), BX
	MOVQ R8, (REG_P3)
	MOVQ R9, (8)(REG_P3)
	MOVQ R10, (16)(REG_P3)
	MOVQ R11, (24)(REG_P3)
	MOVQ R12, (32)(REG_P3)
	MOVQ R13, (40)(REG_P3)
	MOVQ R14, (48)(REG_P3)
	MOVQ R15, (56)(REG_P3)
	MOVQ AX, (64)(REG_P3)
	MOVQ BX, (72)(REG_P3)

	// SUBC for last 6 limbs: x[10..15] - y[10..15].
	MOVQ (80)(REG_P1), R8
	MOVQ (88)(REG_P1), R9
	MOVQ (96)(REG_P1), R10
	MOVQ (104)(REG_P1), R11
	MOVQ (112)(REG_P1), R12
	MOVQ (120)(REG_P1), R13
	SBBQ (80)(REG_P2), R8
	SBBQ (88)(REG_P2), R9
	SBBQ (96)(REG_P2), R10
	SBBQ (104)(REG_P2), R11
	SBBQ (112)(REG_P2), R12
	SBBQ (120)(REG_P2), R13
	MOVQ R8, (80)(REG_P3)
	MOVQ R9, (88)(REG_P3)
	MOVQ R10, (96)(REG_P3)
	MOVQ R11, (104)(REG_P3)
	MOVQ R12, (112)(REG_P3)
	MOVQ R13, (120)(REG_P3)

	// Now the carry flag is 1 if x-y < 0. If so, add p*2^512.
	// CX = 0 - CF: all-ones mask on borrow, zero otherwise.
	SBBQ $0, CX

	// Load p into registers:
	MOVQ P503_0, R8
	// P503_{1,2} = P503_0, so reuse R8
	MOVQ P503_3, R9
	MOVQ P503_4, R10
	MOVQ P503_5, R11
	MOVQ P503_6, R12
	MOVQ P503_7, R13
	// Zero the p503 limbs unless the mask is set.
	ANDQ CX, R8
	ANDQ CX, R9
	ANDQ CX, R10
	ANDQ CX, R11
	ANDQ CX, R12
	ANDQ CX, R13
	// Masked add of p503 into z[8..15] (upper eight limbs); the
	// intervening MOVQs do not disturb the ADDQ/ADCQ carry chain.
	MOVQ (64   )(REG_P3), AX; ADDQ R8, AX;  MOVQ AX, (64   )(REG_P3)
	MOVQ (64+ 8)(REG_P3), AX; ADCQ R8, AX;  MOVQ AX, (64+ 8)(REG_P3)
	MOVQ (64+16)(REG_P3), AX; ADCQ R8, AX;  MOVQ AX, (64+16)(REG_P3)
	MOVQ (64+24)(REG_P3), AX; ADCQ R9, AX;  MOVQ AX, (64+24)(REG_P3)
	MOVQ (64+32)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (64+32)(REG_P3)
	MOVQ (64+40)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (64+40)(REG_P3)
	MOVQ (64+48)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (64+48)(REG_P3)
	MOVQ (64+56)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (64+56)(REG_P3)
	RET