// Field arithmetic for the SIDH prime p503, amd64 assembly.
// (Repository-viewer UI text and size banners removed from this scraped copy.)
  1. // +build amd64,!noasm
  2. #include "textflag.h"
  3. // p503
  4. #define P503_0 $0xFFFFFFFFFFFFFFFF
  5. #define P503_1 $0xFFFFFFFFFFFFFFFF
  6. #define P503_2 $0xFFFFFFFFFFFFFFFF
  7. #define P503_3 $0xABFFFFFFFFFFFFFF
  8. #define P503_4 $0x13085BDA2211E7A0
  9. #define P503_5 $0x1B9BF6C87B7E7DAF
  10. #define P503_6 $0x6045C6BDDA77A4D0
  11. #define P503_7 $0x004066F541811E1E
  12. // p503+1
  13. #define P503P1_3 $0xAC00000000000000
  14. #define P503P1_4 $0x13085BDA2211E7A0
  15. #define P503P1_5 $0x1B9BF6C87B7E7DAF
  16. #define P503P1_6 $0x6045C6BDDA77A4D0
  17. #define P503P1_7 $0x004066F541811E1E
  18. // p503x2
  19. #define P503X2_0 $0xFFFFFFFFFFFFFFFE
  20. #define P503X2_1 $0xFFFFFFFFFFFFFFFF
  21. #define P503X2_2 $0xFFFFFFFFFFFFFFFF
  22. #define P503X2_3 $0x57FFFFFFFFFFFFFF
  23. #define P503X2_4 $0x2610B7B44423CF41
  24. #define P503X2_5 $0x3737ED90F6FCFB5E
  25. #define P503X2_6 $0xC08B8D7BB4EF49A0
  26. #define P503X2_7 $0x0080CDEA83023C3C
  27. #define REG_P1 DI
  28. #define REG_P2 SI
  29. #define REG_P3 DX
  30. // Performs schoolbook multiplication of 2 256-bit numbers. This optimized version
  31. // uses MULX instruction. Macro smashes value in DX.
  32. // Input: I0 and I1.
  33. // Output: O
  34. // All the other arguments are resgisters, used for storing temporary values
  35. #define MULS256_MULX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
  36. MOVQ I0, DX \
  37. MULXQ I1, T1, T0 \ // T0:T1 = A0*B0
  38. MOVQ T1, O \ // O[0]
  39. MULXQ 8+I1, T2, T1 \ // T1:T2 = U0*V1
  40. ADDQ T2, T0 \
  41. MULXQ 16+I1, T3, T2 \ // T2:T3 = U0*V2
  42. ADCQ T3, T1 \
  43. MULXQ 24+I1, T4, T3 \ // T3:T4 = U0*V3
  44. ADCQ T4, T2 \
  45. \ // Column U1
  46. MOVQ 8+I0, DX \
  47. ADCQ $0, T3 \
  48. MULXQ 0+I1, T4, T5 \ // T5:T4 = U1*V0
  49. MULXQ 8+I1, T7, T6 \ // T6:T7 = U1*V1
  50. ADDQ T7, T5 \
  51. MULXQ 16+I1, T8, T7 \ // T7:T8 = U1*V2
  52. ADCQ T8, T6 \
  53. MULXQ 24+I1, T9, T8 \ // T8:T9 = U1*V3
  54. ADCQ T9, T7 \
  55. ADCQ $0, T8 \
  56. ADDQ T0, T4 \
  57. MOVQ T4, 8+O \ // O[1]
  58. ADCQ T1, T5 \
  59. ADCQ T2, T6 \
  60. ADCQ T3, T7 \
  61. \ // Column U2
  62. MOVQ 16+I0, DX \
  63. ADCQ $0, T8 \
  64. MULXQ 0+I1, T0, T1 \ // T1:T0 = U2*V0
  65. MULXQ 8+I1, T3, T2 \ // T2:T3 = U2*V1
  66. ADDQ T3, T1 \
  67. MULXQ 16+I1, T4, T3 \ // T3:T4 = U2*V2
  68. ADCQ T4, T2 \
  69. MULXQ 24+I1, T9, T4 \ // T4:T9 = U2*V3
  70. ADCQ T9, T3 \
  71. \ // Column U3
  72. MOVQ 24+I0, DX \
  73. ADCQ $0, T4 \
  74. ADDQ T5, T0 \
  75. MOVQ T0, 16+O \ // O[2]
  76. ADCQ T6, T1 \
  77. ADCQ T7, T2 \
  78. ADCQ T8, T3 \
  79. ADCQ $0, T4 \
  80. MULXQ 0+I1, T0, T5 \ // T5:T0 = U3*V0
  81. MULXQ 8+I1, T7, T6 \ // T6:T7 = U3*V1
  82. ADDQ T7, T5 \
  83. MULXQ 16+I1, T8, T7 \ // T7:T8 = U3*V2
  84. ADCQ T8, T6 \
  85. MULXQ 24+I1, T9, T8 \ // T8:T9 = U3*V3
  86. ADCQ T9, T7 \
  87. ADCQ $0, T8 \
  88. \ // Add values in remaining columns
  89. ADDQ T0, T1 \
  90. MOVQ T1, 24+O \ // O[3]
  91. ADCQ T5, T2 \
  92. MOVQ T2, 32+O \ // O[4]
  93. ADCQ T6, T3 \
  94. MOVQ T3, 40+O \ // O[5]
  95. ADCQ T7, T4 \
  96. MOVQ T4, 48+O \ // O[6]
  97. ADCQ $0, T8 \ // O[7]
  98. MOVQ T8, 56+O
  99. // Performs schoolbook multiplication of 2 256-bit numbers. This optimized version
  100. // uses ADOX, ADCX and MULX instructions. Macro smashes values in AX and DX.
  101. // Input: I0 and I1.
  102. // Output: O
  103. // All the other arguments resgisters are used for storing temporary values
  104. #define MULS256_MULXADX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
  105. \ // U0[0]
  106. MOVQ 0+I0, DX \ // MULX requires multiplayer in DX
  107. \ // T0:T1 = I1*DX
  108. MULXQ I1, T1, T0 \ // T0:T1 = U0*V0 (low:high)
  109. MOVQ T1, O \ // O0[0]
  110. MULXQ 8+I1, T2, T1 \ // T2:T1 = U0*V1
  111. XORQ AX, AX \
  112. ADOXQ T2, T0 \
  113. MULXQ 16+I1, T3, T2 \ // T2:T3 = U0*V2
  114. ADOXQ T3, T1 \
  115. MULXQ 24+I1, T4, T3 \ // T3:T4 = U0*V3
  116. ADOXQ T4, T2 \
  117. \ // Column U1
  118. MOVQ 8+I0, DX \
  119. MULXQ I1, T4, T5 \ // T5:T4 = U1*V0
  120. ADOXQ AX, T3 \
  121. XORQ AX, AX \
  122. MULXQ 8+I1, T7, T6 \ // T6:T7 = U1*V1
  123. ADOXQ T0, T4 \
  124. MOVQ T4, 8+O \ // O[1]
  125. ADCXQ T7, T5 \
  126. MULXQ 16+I1, T8, T7 \ // T7:T8 = U1*V2
  127. ADCXQ T8, T6 \
  128. ADOXQ T1, T5 \
  129. MULXQ 24+I1, T9, T8 \ // T8:T9 = U1*V3
  130. ADCXQ T9, T7 \
  131. ADCXQ AX, T8 \
  132. ADOXQ T2, T6 \
  133. \ // Column U2
  134. MOVQ 16+I0, DX \
  135. MULXQ I1, T0, T1 \ // T1:T0 = U2*V0
  136. ADOXQ T3, T7 \
  137. ADOXQ AX, T8 \
  138. XORQ AX, AX \
  139. MULXQ 8+I1, T3, T2 \ // T2:T3 = U2*V1
  140. ADOXQ T5, T0 \
  141. MOVQ T0, 16+O \ // O[2]
  142. ADCXQ T3, T1 \
  143. MULXQ 16+I1, T4, T3 \ // T3:T4 = U2*V2
  144. ADCXQ T4, T2 \
  145. ADOXQ T6, T1 \
  146. MULXQ 24+I1, T9, T4 \ // T9:T4 = U2*V3
  147. ADCXQ T9, T3 \
  148. MOVQ 24+I0, DX \
  149. ADCXQ AX, T4 \
  150. \
  151. ADOXQ T7, T2 \
  152. ADOXQ T8, T3 \
  153. ADOXQ AX, T4 \
  154. \ // Column U3
  155. MULXQ I1, T0, T5 \ // T5:T0 = U3*B0
  156. XORQ AX, AX \
  157. MULXQ 8+I1, T7, T6 \ // T6:T7 = U3*B1
  158. ADCXQ T7, T5 \
  159. ADOXQ T0, T1 \
  160. MULXQ 16+I1, T8, T7 \ // T7:T8 = U3*V2
  161. ADCXQ T8, T6 \
  162. ADOXQ T5, T2 \
  163. MULXQ 24+I1, T9, T8 \ // T8:T9 = U3*V3
  164. ADCXQ T9, T7 \
  165. ADCXQ AX, T8 \
  166. \
  167. ADOXQ T6, T3 \
  168. ADOXQ T7, T4 \
  169. ADOXQ AX, T8 \
  170. MOVQ T1, 24+O \ // O[3]
  171. MOVQ T2, 32+O \ // O[4]
  172. MOVQ T3, 40+O \ // O[5]
  173. MOVQ T4, 48+O \ // O[6] and O[7] below
  174. MOVQ T8, 56+O
  175. // Template of a macro that performs schoolbook multiplication of 128-bit with 320-bit
  176. // number. It uses MULX instruction This template must be customized with functions
  177. // performing ADD (add1, add2) and ADD-with-carry (adc1, adc2). addX/adcX may or may
  178. // not be instructions that use two independent carry chains.
  179. // Input:
  180. // * I0 128-bit number
  181. // * I1 320-bit number
  182. // * add1, add2: instruction performing integer addition and starting carry chain
  183. // * adc1, adc2: instruction performing integer addition with carry
  184. // Output: T[0-6] registers
  185. #define MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, add1, add2, adc1, adc2) \
  186. \ // Column 0
  187. MOVQ I0, DX \
  188. MULXQ I1+24(SB), T0, T1 \
  189. MULXQ I1+32(SB), T4, T2 \
  190. XORQ AX, AX \
  191. MULXQ I1+40(SB), T5, T3 \
  192. add1 T4, T1 \
  193. adc1 T5, T2 \
  194. MULXQ I1+48(SB), T7, T4 \
  195. adc1 T7, T3 \
  196. MULXQ I1+56(SB), T6, T5 \
  197. adc1 T6, T4 \
  198. adc1 AX, T5 \
  199. \ // Column 1
  200. MOVQ 8+I0, DX \
  201. MULXQ I1+24(SB), T6, T7 \
  202. add2 T6, T1 \
  203. adc2 T7, T2 \
  204. MULXQ I1+32(SB), T8, T6 \
  205. adc2 T6, T3 \
  206. MULXQ I1+40(SB), T7, T9 \
  207. adc2 T9, T4 \
  208. MULXQ I1+48(SB), T9, T6 \
  209. adc2 T6, T5 \
  210. MULXQ I1+56(SB), DX, T6 \
  211. adc2 AX, T6 \
  212. \ // Output
  213. XORQ AX, AX \
  214. add1 T8, T2 \
  215. adc1 T7, T3 \
  216. adc1 T9, T4 \
  217. adc1 DX, T5 \
  218. adc1 AX, T6
  219. // Multiplies 128-bit with 320-bit integer. Optimized with MULX instruction.
  220. #define MULS_128x320_MULX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
  221. MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADDQ, ADDQ, ADCQ, ADCQ)
  222. // Multiplies 128-bit with 320-bit integer. Optimized with MULX, ADOX and ADCX instructions
  223. #define MULS_128x320_MULXADX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
  224. MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADOXQ, ADCXQ, ADOXQ, ADCXQ)
  225. // Template of a macro performing multiplication of two 512-bit numbers. It uses one
  226. // level of Karatsuba and one level of schoolbook multiplication. Template must be
  227. // customized with macro performing schoolbook multiplication.
  228. // Input:
  229. // * I0, I1 - two 512-bit numbers
  230. // * MULS - either MULS256_MULX or MULS256_MULXADX
  231. // Output: OUT - 1024-bit long
  232. #define MUL(OUT, I0, I1, MULS) \
  233. \ // R[8-11]: U1+U0
  234. XORQ AX, AX \
  235. MOVQ ( 0)(I0), R8 \
  236. MOVQ ( 8)(I0), R9 \
  237. MOVQ (16)(I0), R10 \
  238. MOVQ (24)(I0), R11 \
  239. ADDQ (32)(I0), R8 \
  240. ADCQ (40)(I0), R9 \
  241. ADCQ (48)(I0), R10 \
  242. ADCQ (56)(I0), R11 \
  243. SBBQ $0, AX \ // store mask
  244. MOVQ R8, ( 0)(SP) \
  245. MOVQ R9, ( 8)(SP) \
  246. MOVQ R10, (16)(SP) \
  247. MOVQ R11, (24)(SP) \
  248. \
  249. \ // R[12-15]: V1+V0
  250. XORQ BX, BX \
  251. MOVQ ( 0)(I1), R12 \
  252. MOVQ ( 8)(I1), R13 \
  253. MOVQ (16)(I1), R14 \
  254. MOVQ (24)(I1), R15 \
  255. ADDQ (32)(I1), R12 \
  256. ADCQ (40)(I1), R13 \
  257. ADCQ (48)(I1), R14 \
  258. ADCQ (56)(I1), R15 \
  259. SBBQ $0, BX \ // store mask
  260. MOVQ R12, (32)(SP) \
  261. MOVQ R13, (40)(SP) \
  262. MOVQ R14, (48)(SP) \
  263. MOVQ R15, (56)(SP) \
  264. \ // Prepare mask for U0+U1 (U1+U0 mod 256^4 if U1+U0 sets carry flag, otherwise 0)
  265. ANDQ AX, R12 \
  266. ANDQ AX, R13 \
  267. ANDQ AX, R14 \
  268. ANDQ AX, R15 \
  269. \ // Prepare mask for V0+V1 (V1+V0 mod 256^4 if U1+U0 sets carry flag, otherwise 0)
  270. ANDQ BX, R8 \
  271. ANDQ BX, R9 \
  272. ANDQ BX, R10 \
  273. ANDQ BX, R11 \
  274. \ // res = masked(U0+U1) + masked(V0 + V1)
  275. ADDQ R12, R8 \
  276. ADCQ R13, R9 \
  277. ADCQ R14, R10 \
  278. ADCQ R15, R11 \
  279. \ // SP[64-96] <- res
  280. MOVQ R8, (64)(SP) \
  281. MOVQ R9, (72)(SP) \
  282. MOVQ R10, (80)(SP) \
  283. MOVQ R11, (88)(SP) \
  284. \ // BP will be used for schoolbook multiplication below
  285. MOVQ BP, 96(SP) \
  286. \ // (U1+U0)*(V1+V0)
  287. MULS((64)(OUT), 0(SP), 32(SP), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
  288. \ // U0 x V0
  289. MULS(0(OUT), 0(I0), 0(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
  290. \ // U1 x V1
  291. MULS(0(SP), 32(I0), 32(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
  292. \ // Recover BP
  293. MOVQ 96(SP), BP \
  294. \ // Final part of schoolbook multiplication; R[8-11] = (U0+U1) x (V0+V1)
  295. MOVQ (64)(SP), R8 \
  296. MOVQ (72)(SP), R9 \
  297. MOVQ (80)(SP), R10 \
  298. MOVQ (88)(SP), R11 \
  299. MOVQ (96)(OUT), AX \
  300. ADDQ AX, R8 \
  301. MOVQ (104)(OUT), AX \
  302. ADCQ AX, R9 \
  303. MOVQ (112)(OUT), AX \
  304. ADCQ AX, R10 \
  305. MOVQ (120)(OUT), AX \
  306. ADCQ AX, R11 \
  307. \ // R[12-15, 8-11] = (U0+U1) x (V0+V1) - U0xV0
  308. MOVQ (64)(OUT), R12 \
  309. MOVQ (72)(OUT), R13 \
  310. MOVQ (80)(OUT), R14 \
  311. MOVQ (88)(OUT), R15 \
  312. SUBQ ( 0)(OUT), R12 \
  313. SBBQ ( 8)(OUT), R13 \
  314. SBBQ (16)(OUT), R14 \
  315. SBBQ (24)(OUT), R15 \
  316. SBBQ (32)(OUT), R8 \
  317. SBBQ (40)(OUT), R9 \
  318. SBBQ (48)(OUT), R10 \
  319. SBBQ (56)(OUT), R11 \
  320. \ // r8-r15 <- (U0+U1) x (V0+V1) - U0xV0 - U1xV1
  321. SUBQ ( 0)(SP), R12 \
  322. SBBQ ( 8)(SP), R13 \
  323. SBBQ (16)(SP), R14 \
  324. SBBQ (24)(SP), R15 \
  325. SBBQ (32)(SP), R8 \
  326. SBBQ (40)(SP), R9 \
  327. SBBQ (48)(SP), R10 \
  328. SBBQ (56)(SP), R11 \
  329. \
  330. ; ADDQ (32)(OUT), R12; MOVQ R12, ( 32)(OUT) \
  331. ; ADCQ (40)(OUT), R13; MOVQ R13, ( 40)(OUT) \
  332. ; ADCQ (48)(OUT), R14; MOVQ R14, ( 48)(OUT) \
  333. ; ADCQ (56)(OUT), R15; MOVQ R15, ( 56)(OUT) \
  334. MOVQ ( 0)(SP), AX; ADCQ AX, R8; MOVQ R8, ( 64)(OUT) \
  335. MOVQ ( 8)(SP), AX; ADCQ AX, R9; MOVQ R9, ( 72)(OUT) \
  336. MOVQ (16)(SP), AX; ADCQ AX, R10; MOVQ R10, ( 80)(OUT) \
  337. MOVQ (24)(SP), AX; ADCQ AX, R11; MOVQ R11, ( 88)(OUT) \
  338. MOVQ (32)(SP), R12; ADCQ $0, R12; MOVQ R12, ( 96)(OUT) \
  339. MOVQ (40)(SP), R13; ADCQ $0, R13; MOVQ R13, (104)(OUT) \
  340. MOVQ (48)(SP), R14; ADCQ $0, R14; MOVQ R14, (112)(OUT) \
  341. MOVQ (56)(SP), R15; ADCQ $0, R15; MOVQ R15, (120)(OUT)
  342. // Template for calculating the Montgomery reduction algorithm described in
  343. // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be
  344. // customized with schoolbook multiplicaton for 128 x 320-bit number.
  345. // This macro reuses memory of IN value and *changes* it.
  346. // Input:
  347. // * IN: 1024-bit number to be reduced
  348. // * MULS: either MULS_128x320_MULX or MULS_128x320_MULXADX
  349. // Output: OUT 512-bit
  350. #define REDC(OUT, IN, MULS) \
  351. MULS(0(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
  352. XORQ R15, R15 \
  353. ADDQ (24)(IN), R8 \
  354. ADCQ (32)(IN), R9 \
  355. ADCQ (40)(IN), R10 \
  356. ADCQ (48)(IN), R11 \
  357. ADCQ (56)(IN), R12 \
  358. ADCQ (64)(IN), R13 \
  359. ADCQ (72)(IN), R14 \
  360. ADCQ (80)(IN), R15 \
  361. MOVQ R8, (24)(IN) \
  362. MOVQ R9, (32)(IN) \
  363. MOVQ R10, (40)(IN) \
  364. MOVQ R11, (48)(IN) \
  365. MOVQ R12, (56)(IN) \
  366. MOVQ R13, (64)(IN) \
  367. MOVQ R14, (72)(IN) \
  368. MOVQ R15, (80)(IN) \
  369. MOVQ (88)(IN), R8 \
  370. MOVQ (96)(IN), R9 \
  371. MOVQ (104)(IN), R10 \
  372. MOVQ (112)(IN), R11 \
  373. MOVQ (120)(IN), R12 \
  374. ADCQ $0, R8 \
  375. ADCQ $0, R9 \
  376. ADCQ $0, R10 \
  377. ADCQ $0, R11 \
  378. ADCQ $0, R12 \
  379. MOVQ R8, (88)(IN) \
  380. MOVQ R9, (96)(IN) \
  381. MOVQ R10, (104)(IN) \
  382. MOVQ R11, (112)(IN) \
  383. MOVQ R12, (120)(IN) \
  384. \
  385. MULS(16(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
  386. XORQ R15, R15 \
  387. ADDQ (40)(IN), R8 \
  388. ADCQ (48)(IN), R9 \
  389. ADCQ (56)(IN), R10 \
  390. ADCQ (64)(IN), R11 \
  391. ADCQ (72)(IN), R12 \
  392. ADCQ (80)(IN), R13 \
  393. ADCQ (88)(IN), R14 \
  394. ADCQ (96)(IN), R15 \
  395. MOVQ R8, (40)(IN) \
  396. MOVQ R9, (48)(IN) \
  397. MOVQ R10, (56)(IN) \
  398. MOVQ R11, (64)(IN) \
  399. MOVQ R12, (72)(IN) \
  400. MOVQ R13, (80)(IN) \
  401. MOVQ R14, (88)(IN) \
  402. MOVQ R15, (96)(IN) \
  403. MOVQ (104)(IN), R8 \
  404. MOVQ (112)(IN), R9 \
  405. MOVQ (120)(IN), R10 \
  406. ADCQ $0, R8 \
  407. ADCQ $0, R9 \
  408. ADCQ $0, R10 \
  409. MOVQ R8, (104)(IN) \
  410. MOVQ R9, (112)(IN) \
  411. MOVQ R10, (120)(IN) \
  412. \
  413. MULS(32(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
  414. XORQ R15, R15 \
  415. XORQ BX, BX \
  416. ADDQ ( 56)(IN), R8 \
  417. ADCQ ( 64)(IN), R9 \
  418. ADCQ ( 72)(IN), R10 \
  419. ADCQ ( 80)(IN), R11 \
  420. ADCQ ( 88)(IN), R12 \
  421. ADCQ ( 96)(IN), R13 \
  422. ADCQ (104)(IN), R14 \
  423. ADCQ (112)(IN), R15 \
  424. ADCQ (120)(IN), BX \
  425. MOVQ R8, ( 56)(IN) \
  426. MOVQ R10, ( 72)(IN) \
  427. MOVQ R11, ( 80)(IN) \
  428. MOVQ R12, ( 88)(IN) \
  429. MOVQ R13, ( 96)(IN) \
  430. MOVQ R14, (104)(IN) \
  431. MOVQ R15, (112)(IN) \
  432. MOVQ BX, (120)(IN) \
  433. MOVQ R9, ( 0)(OUT) \ // Result: OUT[0]
  434. \
  435. MULS(48(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
  436. ADDQ ( 72)(IN), R8 \
  437. ADCQ ( 80)(IN), R9 \
  438. ADCQ ( 88)(IN), R10 \
  439. ADCQ ( 96)(IN), R11 \
  440. ADCQ (104)(IN), R12 \
  441. ADCQ (112)(IN), R13 \
  442. ADCQ (120)(IN), R14 \
  443. MOVQ R8, ( 8)(OUT) \ // Result: OUT[1]
  444. MOVQ R9, (16)(OUT) \ // Result: OUT[2]
  445. MOVQ R10, (24)(OUT) \ // Result: OUT[3]
  446. MOVQ R11, (32)(OUT) \ // Result: OUT[4]
  447. MOVQ R12, (40)(OUT) \ // Result: OUT[5]
  448. MOVQ R13, (48)(OUT) \ // Result: OUT[6] and OUT[7]
  449. MOVQ R14, (56)(OUT)
  450. TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8
  451. MOVQ x+0(FP), REG_P1
  452. // Zero AX for later use:
  453. XORQ AX, AX
  454. // Load p into registers:
  455. MOVQ P503_0, R8
  456. // P503_{1,2} = P503_0, so reuse R8
  457. MOVQ P503_3, R9
  458. MOVQ P503_4, R10
  459. MOVQ P503_5, R11
  460. MOVQ P503_6, R12
  461. MOVQ P503_7, R13
  462. // Set x <- x - p
  463. SUBQ R8, ( 0)(REG_P1)
  464. SBBQ R8, ( 8)(REG_P1)
  465. SBBQ R8, (16)(REG_P1)
  466. SBBQ R9, (24)(REG_P1)
  467. SBBQ R10, (32)(REG_P1)
  468. SBBQ R11, (40)(REG_P1)
  469. SBBQ R12, (48)(REG_P1)
  470. SBBQ R13, (56)(REG_P1)
  471. // Save carry flag indicating x-p < 0 as a mask
  472. SBBQ $0, AX
  473. // Conditionally add p to x if x-p < 0
  474. ANDQ AX, R8
  475. ANDQ AX, R9
  476. ANDQ AX, R10
  477. ANDQ AX, R11
  478. ANDQ AX, R12
  479. ANDQ AX, R13
  480. ADDQ R8, ( 0)(REG_P1)
  481. ADCQ R8, ( 8)(REG_P1)
  482. ADCQ R8, (16)(REG_P1)
  483. ADCQ R9, (24)(REG_P1)
  484. ADCQ R10,(32)(REG_P1)
  485. ADCQ R11,(40)(REG_P1)
  486. ADCQ R12,(48)(REG_P1)
  487. ADCQ R13,(56)(REG_P1)
  488. RET
  489. TEXT ·fp503ConditionalSwap(SB),NOSPLIT,$0-17
  490. MOVQ x+0(FP), REG_P1
  491. MOVQ y+8(FP), REG_P2
  492. MOVB choice+16(FP), AL // AL = 0 or 1
  493. MOVBLZX AL, AX // AX = 0 or 1
  494. NEGQ AX // AX = 0x00..00 or 0xff..ff
  495. #ifndef CSWAP_BLOCK
  496. #define CSWAP_BLOCK(idx) \
  497. MOVQ (idx*8)(REG_P1), BX \ // BX = x[idx]
  498. MOVQ (idx*8)(REG_P2), CX \ // CX = y[idx]
  499. MOVQ CX, DX \ // DX = y[idx]
  500. XORQ BX, DX \ // DX = y[idx] ^ x[idx]
  501. ANDQ AX, DX \ // DX = (y[idx] ^ x[idx]) & mask
  502. XORQ DX, BX \ // BX = (y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx]
  503. XORQ DX, CX \ // CX = (y[idx] ^ x[idx]) & mask) ^ y[idx] = y[idx] or x[idx]
  504. MOVQ BX, (idx*8)(REG_P1) \
  505. MOVQ CX, (idx*8)(REG_P2)
  506. #endif
  507. CSWAP_BLOCK(0)
  508. CSWAP_BLOCK(1)
  509. CSWAP_BLOCK(2)
  510. CSWAP_BLOCK(3)
  511. CSWAP_BLOCK(4)
  512. CSWAP_BLOCK(5)
  513. CSWAP_BLOCK(6)
  514. CSWAP_BLOCK(7)
  515. #ifdef CSWAP_BLOCK
  516. #undef CSWAP_BLOCK
  517. #endif
  518. RET
  519. TEXT ·fp503AddReduced(SB),NOSPLIT,$0-24
  520. MOVQ z+0(FP), REG_P3
  521. MOVQ x+8(FP), REG_P1
  522. MOVQ y+16(FP), REG_P2
  523. // Used later to calculate a mask
  524. XORQ CX, CX
  525. // [R8-R15]: z = x + y
  526. MOVQ ( 0)(REG_P1), R8
  527. MOVQ ( 8)(REG_P1), R9
  528. MOVQ (16)(REG_P1), R10
  529. MOVQ (24)(REG_P1), R11
  530. MOVQ (32)(REG_P1), R12
  531. MOVQ (40)(REG_P1), R13
  532. MOVQ (48)(REG_P1), R14
  533. MOVQ (56)(REG_P1), R15
  534. ADDQ ( 0)(REG_P2), R8
  535. ADCQ ( 8)(REG_P2), R9
  536. ADCQ (16)(REG_P2), R10
  537. ADCQ (24)(REG_P2), R11
  538. ADCQ (32)(REG_P2), R12
  539. ADCQ (40)(REG_P2), R13
  540. ADCQ (48)(REG_P2), R14
  541. ADCQ (56)(REG_P2), R15
  542. MOVQ P503X2_0, AX
  543. SUBQ AX, R8
  544. MOVQ P503X2_1, AX
  545. SBBQ AX, R9
  546. SBBQ AX, R10
  547. MOVQ P503X2_3, AX
  548. SBBQ AX, R11
  549. MOVQ P503X2_4, AX
  550. SBBQ AX, R12
  551. MOVQ P503X2_5, AX
  552. SBBQ AX, R13
  553. MOVQ P503X2_6, AX
  554. SBBQ AX, R14
  555. MOVQ P503X2_7, AX
  556. SBBQ AX, R15
  557. // mask
  558. SBBQ $0, CX
  559. // move z to REG_P3
  560. MOVQ R8, ( 0)(REG_P3)
  561. MOVQ R9, ( 8)(REG_P3)
  562. MOVQ R10, (16)(REG_P3)
  563. MOVQ R11, (24)(REG_P3)
  564. MOVQ R12, (32)(REG_P3)
  565. MOVQ R13, (40)(REG_P3)
  566. MOVQ R14, (48)(REG_P3)
  567. MOVQ R15, (56)(REG_P3)
  568. // if z<0 add p503x2 back
  569. MOVQ P503X2_0, R8
  570. MOVQ P503X2_1, R9
  571. MOVQ P503X2_3, R10
  572. MOVQ P503X2_4, R11
  573. MOVQ P503X2_5, R12
  574. MOVQ P503X2_6, R13
  575. MOVQ P503X2_7, R14
  576. ANDQ CX, R8
  577. ANDQ CX, R9
  578. ANDQ CX, R10
  579. ANDQ CX, R11
  580. ANDQ CX, R12
  581. ANDQ CX, R13
  582. ANDQ CX, R14
  583. MOVQ ( 0)(REG_P3), AX; ADDQ R8, AX; MOVQ AX, ( 0)(REG_P3)
  584. MOVQ ( 8)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, ( 8)(REG_P3)
  585. MOVQ (16)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (16)(REG_P3)
  586. MOVQ (24)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (24)(REG_P3)
  587. MOVQ (32)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (32)(REG_P3)
  588. MOVQ (40)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (40)(REG_P3)
  589. MOVQ (48)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (48)(REG_P3)
  590. MOVQ (56)(REG_P3), AX; ADCQ R14, AX; MOVQ AX, (56)(REG_P3)
  591. RET
  592. TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24
  593. MOVQ z+0(FP), REG_P3
  594. MOVQ x+8(FP), REG_P1
  595. MOVQ y+16(FP), REG_P2
  596. // Used later to calculate a mask
  597. XORQ CX, CX
  598. MOVQ ( 0)(REG_P1), R8
  599. MOVQ ( 8)(REG_P1), R9
  600. MOVQ (16)(REG_P1), R10
  601. MOVQ (24)(REG_P1), R11
  602. MOVQ (32)(REG_P1), R12
  603. MOVQ (40)(REG_P1), R13
  604. MOVQ (48)(REG_P1), R14
  605. MOVQ (56)(REG_P1), R15
  606. SUBQ ( 0)(REG_P2), R8
  607. SBBQ ( 8)(REG_P2), R9
  608. SBBQ (16)(REG_P2), R10
  609. SBBQ (24)(REG_P2), R11
  610. SBBQ (32)(REG_P2), R12
  611. SBBQ (40)(REG_P2), R13
  612. SBBQ (48)(REG_P2), R14
  613. SBBQ (56)(REG_P2), R15
  614. // mask
  615. SBBQ $0, CX
  616. // store x-y in REG_P3
  617. MOVQ R8, ( 0)(REG_P3)
  618. MOVQ R9, ( 8)(REG_P3)
  619. MOVQ R10, (16)(REG_P3)
  620. MOVQ R11, (24)(REG_P3)
  621. MOVQ R12, (32)(REG_P3)
  622. MOVQ R13, (40)(REG_P3)
  623. MOVQ R14, (48)(REG_P3)
  624. MOVQ R15, (56)(REG_P3)
  625. // if z<0 add p503x2 back
  626. MOVQ P503X2_0, R8
  627. MOVQ P503X2_1, R9
  628. MOVQ P503X2_3, R10
  629. MOVQ P503X2_4, R11
  630. MOVQ P503X2_5, R12
  631. MOVQ P503X2_6, R13
  632. MOVQ P503X2_7, R14
  633. ANDQ CX, R8
  634. ANDQ CX, R9
  635. ANDQ CX, R10
  636. ANDQ CX, R11
  637. ANDQ CX, R12
  638. ANDQ CX, R13
  639. ANDQ CX, R14
  640. MOVQ ( 0)(REG_P3), AX; ADDQ R8, AX; MOVQ AX, ( 0)(REG_P3)
  641. MOVQ ( 8)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, ( 8)(REG_P3)
  642. MOVQ (16)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (16)(REG_P3)
  643. MOVQ (24)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (24)(REG_P3)
  644. MOVQ (32)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (32)(REG_P3)
  645. MOVQ (40)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (40)(REG_P3)
  646. MOVQ (48)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (48)(REG_P3)
  647. MOVQ (56)(REG_P3), AX; ADCQ R14, AX; MOVQ AX, (56)(REG_P3)
  648. RET
  649. TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24
  650. // Actual implementation
  651. MOVQ z+ 0(FP), CX
  652. MOVQ x+ 8(FP), REG_P2
  653. MOVQ y+16(FP), REG_P1
  654. MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
  655. RET
  656. TEXT ·mulWithMULX(SB), NOSPLIT, $104-24
  657. // Actual implementation
  658. MOVQ z+ 0(FP), CX
  659. MOVQ x+ 8(FP), REG_P2
  660. MOVQ y+16(FP), REG_P1
  661. MUL(CX, REG_P2, REG_P1, MULS256_MULX)
  662. RET
  663. TEXT ·mul(SB), $96-24
  664. // Uses variant of Karatsuba method.
  665. //
  666. // Here we store the destination in CX instead of in REG_P3 because the
  667. // multiplication instructions use DX as an implicit destination
  668. // operand: MULQ $REG sets DX:AX <-- AX * $REG.
  669. // Actual implementation
  670. MOVQ z+0(FP), CX
  671. MOVQ x+8(FP), REG_P1
  672. MOVQ y+16(FP), REG_P2
  673. // RAX and RDX will be used for a mask (0-borrow)
  674. XORQ AX, AX
  675. // RCX[0-3]: U1+U0
  676. MOVQ (32)(REG_P1), R8
  677. MOVQ (40)(REG_P1), R9
  678. MOVQ (48)(REG_P1), R10
  679. MOVQ (56)(REG_P1), R11
  680. ADDQ ( 0)(REG_P1), R8
  681. ADCQ ( 8)(REG_P1), R9
  682. ADCQ (16)(REG_P1), R10
  683. ADCQ (24)(REG_P1), R11
  684. MOVQ R8, ( 0)(CX)
  685. MOVQ R9, ( 8)(CX)
  686. MOVQ R10, (16)(CX)
  687. MOVQ R11, (24)(CX)
  688. SBBQ $0, AX
  689. // R12-R15: V1+V0
  690. XORQ DX, DX
  691. MOVQ (32)(REG_P2), R12
  692. MOVQ (40)(REG_P2), R13
  693. MOVQ (48)(REG_P2), R14
  694. MOVQ (56)(REG_P2), R15
  695. ADDQ ( 0)(REG_P2), R12
  696. ADCQ ( 8)(REG_P2), R13
  697. ADCQ (16)(REG_P2), R14
  698. ADCQ (24)(REG_P2), R15
  699. SBBQ $0, DX
  700. // Store carries on stack
  701. MOVQ AX, (64)(SP)
  702. MOVQ DX, (72)(SP)
  703. // (SP[0-3],R8,R9,R10,R11) <- (U0+U1)*(V0+V1).
  704. // MUL using comba; In comments below U=U0+U1 V=V0+V1
  705. // U0*V0
  706. MOVQ (CX), AX
  707. MULQ R12
  708. MOVQ AX, (SP) // C0
  709. MOVQ DX, R8
  710. // U0*V1
  711. XORQ R9, R9
  712. MOVQ (CX), AX
  713. MULQ R13
  714. ADDQ AX, R8
  715. ADCQ DX, R9
  716. // U1*V0
  717. XORQ R10, R10
  718. MOVQ (8)(CX), AX
  719. MULQ R12
  720. ADDQ AX, R8
  721. MOVQ R8, (8)(SP) // C1
  722. ADCQ DX, R9
  723. ADCQ $0, R10
  724. // U0*V2
  725. XORQ R8, R8
  726. MOVQ (CX), AX
  727. MULQ R14
  728. ADDQ AX, R9
  729. ADCQ DX, R10
  730. ADCQ $0, R8
  731. // U2*V0
  732. MOVQ (16)(CX), AX
  733. MULQ R12
  734. ADDQ AX, R9
  735. ADCQ DX, R10
  736. ADCQ $0, R8
  737. // U1*V1
  738. MOVQ (8)(CX), AX
  739. MULQ R13
  740. ADDQ AX, R9
  741. MOVQ R9, (16)(SP) // C2
  742. ADCQ DX, R10
  743. ADCQ $0, R8
  744. // U0*V3
  745. XORQ R9, R9
  746. MOVQ (CX), AX
  747. MULQ R15
  748. ADDQ AX, R10
  749. ADCQ DX, R8
  750. ADCQ $0, R9
  751. // U3*V0
  752. MOVQ (24)(CX), AX
  753. MULQ R12
  754. ADDQ AX, R10
  755. ADCQ DX, R8
  756. ADCQ $0, R9
  757. // U1*V2
  758. MOVQ (8)(CX), AX
  759. MULQ R14
  760. ADDQ AX, R10
  761. ADCQ DX, R8
  762. ADCQ $0, R9
  763. // U2*V1
  764. MOVQ (16)(CX), AX
  765. MULQ R13
  766. ADDQ AX, R10
  767. MOVQ R10, (24)(SP) // C3
  768. ADCQ DX, R8
  769. ADCQ $0, R9
  770. // U1*V3
  771. XORQ R10, R10
  772. MOVQ (8)(CX), AX
  773. MULQ R15
  774. ADDQ AX, R8
  775. ADCQ DX, R9
  776. ADCQ $0, R10
  777. // U3*V1
  778. MOVQ (24)(CX), AX
  779. MULQ R13
  780. ADDQ AX, R8
  781. ADCQ DX, R9
  782. ADCQ $0, R10
  783. // U2*V2
  784. MOVQ (16)(CX), AX
  785. MULQ R14
  786. ADDQ AX, R8
  787. MOVQ R8, (32)(SP) // C4
  788. ADCQ DX, R9
  789. ADCQ $0, R10
  790. // U2*V3
  791. XORQ R11, R11
  792. MOVQ (16)(CX), AX
  793. MULQ R15
  794. ADDQ AX, R9
  795. ADCQ DX, R10
  796. ADCQ $0, R11
  797. // U3*V2
  798. MOVQ (24)(CX), AX
  799. MULQ R14
  800. ADDQ AX, R9 // C5
  801. ADCQ DX, R10
  802. ADCQ $0, R11
  803. // U3*V3
  804. MOVQ (24)(CX), AX
  805. MULQ R15
  806. ADDQ AX, R10 // C6
  807. ADCQ DX, R11 // C7
  808. MOVQ (64)(SP), AX
  809. ANDQ AX, R12
  810. ANDQ AX, R13
  811. ANDQ AX, R14
  812. ANDQ AX, R15
  813. ADDQ R8, R12
  814. ADCQ R9, R13
  815. ADCQ R10, R14
  816. ADCQ R11, R15
  817. MOVQ (72)(SP), AX
  818. MOVQ (CX), R8
  819. MOVQ (8)(CX), R9
  820. MOVQ (16)(CX), R10
  821. MOVQ (24)(CX), R11
  822. ANDQ AX, R8
  823. ANDQ AX, R9
  824. ANDQ AX, R10
  825. ANDQ AX, R11
  826. ADDQ R12, R8
  827. ADCQ R13, R9
  828. ADCQ R14, R10
  829. ADCQ R15, R11
  830. MOVQ R8, (32)(SP)
  831. MOVQ R9, (40)(SP)
  832. MOVQ R10, (48)(SP)
  833. MOVQ R11, (56)(SP)
  834. // CX[0-7] <- AL*BL
  835. // U0*V0
  836. MOVQ (REG_P1), R11
  837. MOVQ (REG_P2), AX
  838. MULQ R11
  839. XORQ R9, R9
  840. MOVQ AX, (CX) // C0
  841. MOVQ DX, R8
  842. // U0*V1
  843. MOVQ (16)(REG_P1), R14
  844. MOVQ (8)(REG_P2), AX
  845. MULQ R11
  846. XORQ R10, R10
  847. ADDQ AX, R8
  848. ADCQ DX, R9
  849. // U1*V0
  850. MOVQ (8)(REG_P1), R12
  851. MOVQ (REG_P2), AX
  852. MULQ R12
  853. ADDQ AX, R8
  854. MOVQ R8, (8)(CX) // C1
  855. ADCQ DX, R9
  856. ADCQ $0, R10
  857. // U0*V2
  858. XORQ R8, R8
  859. MOVQ (16)(REG_P2), AX
  860. MULQ R11
  861. ADDQ AX, R9
  862. ADCQ DX, R10
  863. ADCQ $0, R8
  864. // U2*V0
  865. MOVQ (REG_P2), R13
  866. MOVQ R14, AX
  867. MULQ R13
  868. ADDQ AX, R9
  869. ADCQ DX, R10
  870. ADCQ $0, R8
  871. // U1*V1
  872. MOVQ (8)(REG_P2), AX
  873. MULQ R12
  874. ADDQ AX, R9
  875. MOVQ R9, (16)(CX) // C2
  876. ADCQ DX, R10
  877. ADCQ $0, R8
  878. // U0*V3
  879. XORQ R9, R9
  880. MOVQ (24)(REG_P2), AX
  881. MULQ R11
  882. MOVQ (24)(REG_P1), R15
  883. ADDQ AX, R10
  884. ADCQ DX, R8
  885. ADCQ $0, R9
  886. // U3*V1
  887. MOVQ R15, AX
  888. MULQ R13
  889. ADDQ AX, R10
  890. ADCQ DX, R8
  891. ADCQ $0, R9
  892. // U2*V2
  893. MOVQ (16)(REG_P2), AX
  894. MULQ R12
  895. ADDQ AX, R10
  896. ADCQ DX, R8
  897. ADCQ $0, R9
  898. // U2*V3
  899. MOVQ (8)(REG_P2), AX
  900. MULQ R14
  901. ADDQ AX, R10
  902. MOVQ R10, (24)(CX) // C3
  903. ADCQ DX, R8
  904. ADCQ $0, R9
  905. // U3*V2
  906. XORQ R10, R10
  907. MOVQ (24)(REG_P2), AX
  908. MULQ R12
  909. ADDQ AX, R8
  910. ADCQ DX, R9
  911. ADCQ $0, R10
  912. // U3*V1
  913. MOVQ (8)(REG_P2), AX
  914. MULQ R15
  915. ADDQ AX, R8
  916. ADCQ DX, R9
  917. ADCQ $0, R10
  918. // U2*V2
  919. MOVQ (16)(REG_P2), AX
  920. MULQ R14
  921. ADDQ AX, R8
  922. MOVQ R8, (32)(CX) // C4
  923. ADCQ DX, R9
  924. ADCQ $0, R10
  925. // U2*V3
  926. XORQ R8, R8
  927. MOVQ (24)(REG_P2), AX
  928. MULQ R14
  929. ADDQ AX, R9
  930. ADCQ DX, R10
  931. ADCQ $0, R8
  932. // U3*V2
  933. MOVQ (16)(REG_P2), AX
  934. MULQ R15
  935. ADDQ AX, R9
  936. MOVQ R9, (40)(CX) // C5
  937. ADCQ DX, R10
  938. ADCQ $0, R8
  939. // U3*V3
  940. MOVQ (24)(REG_P2), AX
  941. MULQ R15
  942. ADDQ AX, R10
  943. MOVQ R10, (48)(CX) // C6
  944. ADCQ DX, R8
  945. MOVQ R8, (56)(CX) // C7
  946. // CX[8-15] <- U1*V1
  947. MOVQ (32)(REG_P1), R11
  948. MOVQ (32)(REG_P2), AX
  949. MULQ R11
  950. XORQ R9, R9
  951. MOVQ AX, (64)(CX) // C0
  952. MOVQ DX, R8
  953. MOVQ (48)(REG_P1), R14
  954. MOVQ (40)(REG_P2), AX
  955. MULQ R11
  956. XORQ R10, R10
  957. ADDQ AX, R8
  958. ADCQ DX, R9
  959. MOVQ (40)(REG_P1), R12
  960. MOVQ (32)(REG_P2), AX
  961. MULQ R12
  962. ADDQ AX, R8
  963. MOVQ R8, (72)(CX) // C1
  964. ADCQ DX, R9
  965. ADCQ $0, R10
  966. XORQ R8, R8
  967. MOVQ (48)(REG_P2), AX
  968. MULQ R11
  969. ADDQ AX, R9
  970. ADCQ DX, R10
  971. ADCQ $0, R8
  972. MOVQ (32)(REG_P2), R13
  973. MOVQ R14, AX
  974. MULQ R13
  975. ADDQ AX, R9
  976. ADCQ DX, R10
  977. ADCQ $0, R8
  978. MOVQ (40)(REG_P2), AX
  979. MULQ R12
  980. ADDQ AX, R9
  981. MOVQ R9, (80)(CX) // C2
  982. ADCQ DX, R10
  983. ADCQ $0, R8
  984. XORQ R9, R9
  985. MOVQ (56)(REG_P2), AX
  986. MULQ R11
  987. MOVQ (56)(REG_P1), R15
  988. ADDQ AX, R10
  989. ADCQ DX, R8
  990. ADCQ $0, R9
  991. MOVQ R15, AX
  992. MULQ R13
  993. ADDQ AX, R10
  994. ADCQ DX, R8
  995. ADCQ $0, R9
  996. MOVQ (48)(REG_P2), AX
  997. MULQ R12
  998. ADDQ AX, R10
  999. ADCQ DX, R8
  1000. ADCQ $0, R9
  1001. MOVQ (40)(REG_P2), AX
  1002. MULQ R14
  1003. ADDQ AX, R10
  1004. MOVQ R10, (88)(CX) // C3
  1005. ADCQ DX, R8
  1006. ADCQ $0, R9
  1007. XORQ R10, R10
  1008. MOVQ (56)(REG_P2), AX
  1009. MULQ R12
  1010. ADDQ AX, R8
  1011. ADCQ DX, R9
  1012. ADCQ $0, R10
  1013. MOVQ (40)(REG_P2), AX
  1014. MULQ R15
  1015. ADDQ AX, R8
  1016. ADCQ DX, R9
  1017. ADCQ $0, R10
  1018. MOVQ (48)(REG_P2), AX
  1019. MULQ R14
  1020. ADDQ AX, R8
  1021. MOVQ R8, (96)(CX) // C4
  1022. ADCQ DX, R9
  1023. ADCQ $0, R10
  1024. XORQ R8, R8
  1025. MOVQ (56)(REG_P2), AX
  1026. MULQ R14
  1027. ADDQ AX, R9
  1028. ADCQ DX, R10
  1029. ADCQ $0, R8
  1030. MOVQ (48)(REG_P2), AX
  1031. MULQ R15
  1032. ADDQ AX, R9
  1033. MOVQ R9, (104)(CX) // C5
  1034. ADCQ DX, R10
  1035. ADCQ $0, R8
  1036. MOVQ (56)(REG_P2), AX
  1037. MULQ R15
  1038. ADDQ AX, R10
  1039. MOVQ R10, (112)(CX) // C6
  1040. ADCQ DX, R8
  1041. MOVQ R8, (120)(CX) // C7
  1042. // [R8-R15] <- (U0+U1)*(V0+V1) - U1*V1
  1043. MOVQ (SP), R8
  1044. SUBQ (CX), R8
  1045. MOVQ (8)(SP), R9
  1046. SBBQ (8)(CX), R9
  1047. MOVQ (16)(SP), R10
  1048. SBBQ (16)(CX), R10
  1049. MOVQ (24)(SP), R11
  1050. SBBQ (24)(CX), R11
  1051. MOVQ (32)(SP), R12
  1052. SBBQ (32)(CX), R12
  1053. MOVQ (40)(SP), R13
  1054. SBBQ (40)(CX), R13
  1055. MOVQ (48)(SP), R14
  1056. SBBQ (48)(CX), R14
  1057. MOVQ (56)(SP), R15
  1058. SBBQ (56)(CX), R15
  1059. // [R8-R15] <- (U0+U1)*(V0+V1) - U1*V0 - U0*U1
  1060. MOVQ ( 64)(CX), AX; SUBQ AX, R8
  1061. MOVQ ( 72)(CX), AX; SBBQ AX, R9
  1062. MOVQ ( 80)(CX), AX; SBBQ AX, R10
  1063. MOVQ ( 88)(CX), AX; SBBQ AX, R11
  1064. MOVQ ( 96)(CX), AX; SBBQ AX, R12
  1065. MOVQ (104)(CX), DX; SBBQ DX, R13
  1066. MOVQ (112)(CX), DI; SBBQ DI, R14
  1067. MOVQ (120)(CX), SI; SBBQ SI, R15
  1068. // Final result
  1069. ADDQ (32)(CX), R8; MOVQ R8, (32)(CX)
  1070. ADCQ (40)(CX), R9; MOVQ R9, (40)(CX)
  1071. ADCQ (48)(CX), R10; MOVQ R10, (48)(CX)
  1072. ADCQ (56)(CX), R11; MOVQ R11, (56)(CX)
  1073. ADCQ (64)(CX), R12; MOVQ R12, (64)(CX)
  1074. ADCQ (72)(CX), R13; MOVQ R13, (72)(CX)
  1075. ADCQ (80)(CX), R14; MOVQ R14, (80)(CX)
  1076. ADCQ (88)(CX), R15; MOVQ R15, (88)(CX)
  1077. ADCQ $0, AX; MOVQ AX, (96)(CX)
  1078. ADCQ $0, DX; MOVQ DX, (104)(CX)
  1079. ADCQ $0, DI; MOVQ DI, (112)(CX)
  1080. ADCQ $0, SI; MOVQ SI, (120)(CX)
  1081. RET
  1082. TEXT ·redc(SB), $0-16
  1083. MOVQ z+0(FP), REG_P2
  1084. MOVQ x+8(FP), REG_P1
  1085. MOVQ (REG_P1), R11
  1086. MOVQ P503P1_3, AX
  1087. MULQ R11
  1088. XORQ R8, R8
  1089. ADDQ (24)(REG_P1), AX
  1090. MOVQ AX, (24)(REG_P2)
  1091. ADCQ DX, R8
  1092. XORQ R9, R9
  1093. MOVQ P503P1_4, AX
  1094. MULQ R11
  1095. XORQ R10, R10
  1096. ADDQ AX, R8
  1097. ADCQ DX, R9
  1098. MOVQ (8)(REG_P1), R12
  1099. MOVQ P503P1_3, AX
  1100. MULQ R12
  1101. ADDQ AX, R8
  1102. ADCQ DX, R9
  1103. ADCQ $0, R10
  1104. ADDQ (32)(REG_P1), R8
  1105. MOVQ R8, (32)(REG_P2) // Z4
  1106. ADCQ $0, R9
  1107. ADCQ $0, R10
  1108. XORQ R8, R8
  1109. MOVQ P503P1_5, AX
  1110. MULQ R11
  1111. ADDQ AX, R9
  1112. ADCQ DX, R10
  1113. ADCQ $0, R8
  1114. MOVQ P503P1_4, AX
  1115. MULQ R12
  1116. ADDQ AX, R9
  1117. ADCQ DX, R10
  1118. ADCQ $0, R8
  1119. MOVQ (16)(REG_P1), R13
  1120. MOVQ P503P1_3, AX
  1121. MULQ R13
  1122. ADDQ AX, R9
  1123. ADCQ DX, R10
  1124. ADCQ $0, R8
  1125. ADDQ (40)(REG_P1), R9
  1126. MOVQ R9, (40)(REG_P2) // Z5
  1127. ADCQ $0, R10
  1128. ADCQ $0, R8
  1129. XORQ R9, R9
  1130. MOVQ P503P1_6, AX
  1131. MULQ R11
  1132. ADDQ AX, R10
  1133. ADCQ DX, R8
  1134. ADCQ $0, R9
  1135. MOVQ P503P1_5, AX
  1136. MULQ R12
  1137. ADDQ AX, R10
  1138. ADCQ DX, R8
  1139. ADCQ $0, R9
  1140. MOVQ P503P1_4, AX
  1141. MULQ R13
  1142. ADDQ AX, R10
  1143. ADCQ DX, R8
  1144. ADCQ $0, R9
  1145. MOVQ (24)(REG_P2), R14
  1146. MOVQ P503P1_3, AX
  1147. MULQ R14
  1148. ADDQ AX, R10
  1149. ADCQ DX, R8
  1150. ADCQ $0, R9
  1151. ADDQ (48)(REG_P1), R10
  1152. MOVQ R10, (48)(REG_P2) // Z6
  1153. ADCQ $0, R8
  1154. ADCQ $0, R9
  1155. XORQ R10, R10
  1156. MOVQ P503P1_7, AX
  1157. MULQ R11
  1158. ADDQ AX, R8
  1159. ADCQ DX, R9
  1160. ADCQ $0, R10
  1161. MOVQ P503P1_6, AX
  1162. MULQ R12
  1163. ADDQ AX, R8
  1164. ADCQ DX, R9
  1165. ADCQ $0, R10
  1166. MOVQ P503P1_5, AX
  1167. MULQ R13
  1168. ADDQ AX, R8
  1169. ADCQ DX, R9
  1170. ADCQ $0, R10
  1171. MOVQ P503P1_4, AX
  1172. MULQ R14
  1173. ADDQ AX, R8
  1174. ADCQ DX, R9
  1175. ADCQ $0, R10
  1176. MOVQ (32)(REG_P2), R15
  1177. MOVQ P503P1_3, AX
  1178. MULQ R15
  1179. ADDQ AX, R8
  1180. ADCQ DX, R9
  1181. ADCQ $0, R10
  1182. ADDQ (56)(REG_P1), R8
  1183. MOVQ R8, (56)(REG_P2) // Z7
  1184. ADCQ $0, R9
  1185. ADCQ $0, R10
  1186. XORQ R8, R8
  1187. MOVQ P503P1_7, AX
  1188. MULQ R12
  1189. ADDQ AX, R9
  1190. ADCQ DX, R10
  1191. ADCQ $0, R8
  1192. MOVQ P503P1_6, AX
  1193. MULQ R13
  1194. ADDQ AX, R9
  1195. ADCQ DX, R10
  1196. ADCQ $0, R8
  1197. MOVQ P503P1_5, AX
  1198. MULQ R14
  1199. ADDQ AX, R9
  1200. ADCQ DX, R10
  1201. ADCQ $0, R8
  1202. MOVQ P503P1_4, AX
  1203. MULQ R15
  1204. ADDQ AX, R9
  1205. ADCQ DX, R10
  1206. ADCQ $0, R8
  1207. MOVQ (40)(REG_P2), CX
  1208. MOVQ P503P1_3, AX
  1209. MULQ CX
  1210. ADDQ AX, R9
  1211. ADCQ DX, R10
  1212. ADCQ $0, R8
  1213. ADDQ (64)(REG_P1), R9
  1214. MOVQ R9, (REG_P2) // Z0
  1215. ADCQ $0, R10
  1216. ADCQ $0, R8
  1217. XORQ R9, R9
  1218. MOVQ P503P1_7, AX
  1219. MULQ R13
  1220. ADDQ AX, R10
  1221. ADCQ DX, R8
  1222. ADCQ $0, R9
  1223. MOVQ P503P1_6, AX
  1224. MULQ R14
  1225. ADDQ AX, R10
  1226. ADCQ DX, R8
  1227. ADCQ $0, R9
  1228. MOVQ P503P1_5, AX
  1229. MULQ R15
  1230. ADDQ AX, R10
  1231. ADCQ DX, R8
  1232. ADCQ $0, R9
  1233. MOVQ P503P1_4, AX
  1234. MULQ CX
  1235. ADDQ AX, R10
  1236. ADCQ DX, R8
  1237. ADCQ $0, R9
  1238. MOVQ (48)(REG_P2), R13
  1239. MOVQ P503P1_3, AX
  1240. MULQ R13
  1241. ADDQ AX, R10
  1242. ADCQ DX, R8
  1243. ADCQ $0, R9
  1244. ADDQ (72)(REG_P1), R10
  1245. MOVQ R10, (8)(REG_P2) // Z1
  1246. ADCQ $0, R8
  1247. ADCQ $0, R9
  1248. XORQ R10, R10
  1249. MOVQ P503P1_7, AX
  1250. MULQ R14
  1251. ADDQ AX, R8
  1252. ADCQ DX, R9
  1253. ADCQ $0, R10
  1254. MOVQ P503P1_6, AX
  1255. MULQ R15
  1256. ADDQ AX, R8
  1257. ADCQ DX, R9
  1258. ADCQ $0, R10
  1259. MOVQ P503P1_5, AX
  1260. MULQ CX
  1261. ADDQ AX, R8
  1262. ADCQ DX, R9
  1263. ADCQ $0, R10
  1264. MOVQ P503P1_4, AX
  1265. MULQ R13
  1266. ADDQ AX, R8
  1267. ADCQ DX, R9
  1268. ADCQ $0, R10
  1269. MOVQ (56)(REG_P2), R14
  1270. MOVQ P503P1_3, AX
  1271. MULQ R14
  1272. ADDQ AX, R8
  1273. ADCQ DX, R9
  1274. ADCQ $0, R10
  1275. ADDQ (80)(REG_P1), R8
  1276. MOVQ R8, (16)(REG_P2) // Z2
  1277. ADCQ $0, R9
  1278. ADCQ $0, R10
  1279. XORQ R8, R8
  1280. MOVQ P503P1_7, AX
  1281. MULQ R15
  1282. ADDQ AX, R9
  1283. ADCQ DX, R10
  1284. ADCQ $0, R8
  1285. MOVQ P503P1_6, AX
  1286. MULQ CX
  1287. ADDQ AX, R9
  1288. ADCQ DX, R10
  1289. ADCQ $0, R8
  1290. MOVQ P503P1_5, AX
  1291. MULQ R13
  1292. ADDQ AX, R9
  1293. ADCQ DX, R10
  1294. ADCQ $0, R8
  1295. MOVQ P503P1_4, AX
  1296. MULQ R14
  1297. ADDQ AX, R9
  1298. ADCQ DX, R10
  1299. ADCQ $0, R8
  1300. ADDQ (88)(REG_P1), R9
  1301. MOVQ R9, (24)(REG_P2) // Z3
  1302. ADCQ $0, R10
  1303. ADCQ $0, R8
  1304. XORQ R9, R9
  1305. MOVQ P503P1_7, AX
  1306. MULQ CX
  1307. ADDQ AX, R10
  1308. ADCQ DX, R8
  1309. ADCQ $0, R9
  1310. MOVQ P503P1_6, AX
  1311. MULQ R13
  1312. ADDQ AX, R10
  1313. ADCQ DX, R8
  1314. ADCQ $0, R9
  1315. MOVQ P503P1_5, AX
  1316. MULQ R14
  1317. ADDQ AX, R10
  1318. ADCQ DX, R8
  1319. ADCQ $0, R9
  1320. ADDQ (96)(REG_P1), R10
  1321. MOVQ R10, (32)(REG_P2) // Z4
  1322. ADCQ $0, R8
  1323. ADCQ $0, R9
  1324. XORQ R10, R10
  1325. MOVQ P503P1_7, AX
  1326. MULQ R13
  1327. ADDQ AX, R8
  1328. ADCQ DX, R9
  1329. ADCQ $0, R10
  1330. MOVQ P503P1_6, AX
  1331. MULQ R14
  1332. ADDQ AX, R8
  1333. ADCQ DX, R9
  1334. ADCQ $0, R10
  1335. ADDQ (104)(REG_P1), R8 // Z5
  1336. MOVQ R8, (40)(REG_P2) // Z5
  1337. ADCQ $0, R9
  1338. ADCQ $0, R10
  1339. MOVQ P503P1_7, AX
  1340. MULQ R14
  1341. ADDQ AX, R9
  1342. ADCQ DX, R10
  1343. ADDQ (112)(REG_P1), R9 // Z6
  1344. MOVQ R9, (48)(REG_P2) // Z6
  1345. ADCQ $0, R10
  1346. ADDQ (120)(REG_P1), R10 // Z7
  1347. MOVQ R10, (56)(REG_P2) // Z7
  1348. RET
  1349. TEXT ·redcWithMULX(SB), $0-16
  1350. MOVQ z+0(FP), DI
  1351. MOVQ x+8(FP), SI
  1352. REDC(DI, SI, MULS_128x320_MULX)
  1353. RET
  1354. TEXT ·redcWithMULXADX(SB), $0-16
  1355. MOVQ z+0(FP), DI
  1356. MOVQ x+8(FP), SI
  1357. REDC(DI, SI, MULS_128x320_MULXADX)
  1358. RET
  1359. TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24
  1360. MOVQ z+0(FP), REG_P3
  1361. MOVQ x+8(FP), REG_P1
  1362. MOVQ y+16(FP), REG_P2
  1363. MOVQ (REG_P1), R8
  1364. MOVQ (8)(REG_P1), R9
  1365. MOVQ (16)(REG_P1), R10
  1366. MOVQ (24)(REG_P1), R11
  1367. MOVQ (32)(REG_P1), R12
  1368. MOVQ (40)(REG_P1), R13
  1369. MOVQ (48)(REG_P1), R14
  1370. MOVQ (56)(REG_P1), R15
  1371. ADDQ (REG_P2), R8
  1372. ADCQ (8)(REG_P2), R9
  1373. ADCQ (16)(REG_P2), R10
  1374. ADCQ (24)(REG_P2), R11
  1375. ADCQ (32)(REG_P2), R12
  1376. ADCQ (40)(REG_P2), R13
  1377. ADCQ (48)(REG_P2), R14
  1378. ADCQ (56)(REG_P2), R15
  1379. MOVQ R8, (REG_P3)
  1380. MOVQ R9, (8)(REG_P3)
  1381. MOVQ R10, (16)(REG_P3)
  1382. MOVQ R11, (24)(REG_P3)
  1383. MOVQ R12, (32)(REG_P3)
  1384. MOVQ R13, (40)(REG_P3)
  1385. MOVQ R14, (48)(REG_P3)
  1386. MOVQ R15, (56)(REG_P3)
  1387. RET
  1388. TEXT ·fp503X2AddLazy(SB), NOSPLIT, $0-24
  1389. MOVQ z+0(FP), REG_P3
  1390. MOVQ x+8(FP), REG_P1
  1391. MOVQ y+16(FP), REG_P2
  1392. MOVQ (REG_P1), R8
  1393. MOVQ (8)(REG_P1), R9
  1394. MOVQ (16)(REG_P1), R10
  1395. MOVQ (24)(REG_P1), R11
  1396. MOVQ (32)(REG_P1), R12
  1397. MOVQ (40)(REG_P1), R13
  1398. MOVQ (48)(REG_P1), R14
  1399. MOVQ (56)(REG_P1), R15
  1400. MOVQ (64)(REG_P1), AX
  1401. MOVQ (72)(REG_P1), BX
  1402. MOVQ (80)(REG_P1), CX
  1403. ADDQ (REG_P2), R8
  1404. ADCQ (8)(REG_P2), R9
  1405. ADCQ (16)(REG_P2), R10
  1406. ADCQ (24)(REG_P2), R11
  1407. ADCQ (32)(REG_P2), R12
  1408. ADCQ (40)(REG_P2), R13
  1409. ADCQ (48)(REG_P2), R14
  1410. ADCQ (56)(REG_P2), R15
  1411. ADCQ (64)(REG_P2), AX
  1412. ADCQ (72)(REG_P2), BX
  1413. ADCQ (80)(REG_P2), CX
  1414. MOVQ R8, (REG_P3)
  1415. MOVQ R9, (8)(REG_P3)
  1416. MOVQ R10, (16)(REG_P3)
  1417. MOVQ R11, (24)(REG_P3)
  1418. MOVQ R12, (32)(REG_P3)
  1419. MOVQ R13, (40)(REG_P3)
  1420. MOVQ R14, (48)(REG_P3)
  1421. MOVQ R15, (56)(REG_P3)
  1422. MOVQ AX, (64)(REG_P3)
  1423. MOVQ BX, (72)(REG_P3)
  1424. MOVQ CX, (80)(REG_P3)
  1425. MOVQ (88)(REG_P1), R8
  1426. MOVQ (96)(REG_P1), R9
  1427. MOVQ (104)(REG_P1), R10
  1428. MOVQ (112)(REG_P1), R11
  1429. MOVQ (120)(REG_P1), R12
  1430. ADCQ (88)(REG_P2), R8
  1431. ADCQ (96)(REG_P2), R9
  1432. ADCQ (104)(REG_P2), R10
  1433. ADCQ (112)(REG_P2), R11
  1434. ADCQ (120)(REG_P2), R12
  1435. MOVQ R8, (88)(REG_P3)
  1436. MOVQ R9, (96)(REG_P3)
  1437. MOVQ R10, (104)(REG_P3)
  1438. MOVQ R11, (112)(REG_P3)
  1439. MOVQ R12, (120)(REG_P3)
  1440. RET
  1441. TEXT ·fp503X2SubLazy(SB), NOSPLIT, $0-24
  1442. MOVQ z+0(FP), REG_P3
  1443. MOVQ x+8(FP), REG_P1
  1444. MOVQ y+16(FP), REG_P2
  1445. // Used later to store result of 0-borrow
  1446. XORQ CX, CX
  1447. // SUBC for first 11 limbs
  1448. MOVQ (REG_P1), R8
  1449. MOVQ (8)(REG_P1), R9
  1450. MOVQ (16)(REG_P1), R10
  1451. MOVQ (24)(REG_P1), R11
  1452. MOVQ (32)(REG_P1), R12
  1453. MOVQ (40)(REG_P1), R13
  1454. MOVQ (48)(REG_P1), R14
  1455. MOVQ (56)(REG_P1), R15
  1456. MOVQ (64)(REG_P1), AX
  1457. MOVQ (72)(REG_P1), BX
  1458. SUBQ (REG_P2), R8
  1459. SBBQ (8)(REG_P2), R9
  1460. SBBQ (16)(REG_P2), R10
  1461. SBBQ (24)(REG_P2), R11
  1462. SBBQ (32)(REG_P2), R12
  1463. SBBQ (40)(REG_P2), R13
  1464. SBBQ (48)(REG_P2), R14
  1465. SBBQ (56)(REG_P2), R15
  1466. SBBQ (64)(REG_P2), AX
  1467. SBBQ (72)(REG_P2), BX
  1468. MOVQ R8, (REG_P3)
  1469. MOVQ R9, (8)(REG_P3)
  1470. MOVQ R10, (16)(REG_P3)
  1471. MOVQ R11, (24)(REG_P3)
  1472. MOVQ R12, (32)(REG_P3)
  1473. MOVQ R13, (40)(REG_P3)
  1474. MOVQ R14, (48)(REG_P3)
  1475. MOVQ R15, (56)(REG_P3)
  1476. MOVQ AX, (64)(REG_P3)
  1477. MOVQ BX, (72)(REG_P3)
  1478. // SUBC for last 5 limbs
  1479. MOVQ (80)(REG_P1), R8
  1480. MOVQ (88)(REG_P1), R9
  1481. MOVQ (96)(REG_P1), R10
  1482. MOVQ (104)(REG_P1), R11
  1483. MOVQ (112)(REG_P1), R12
  1484. MOVQ (120)(REG_P1), R13
  1485. SBBQ (80)(REG_P2), R8
  1486. SBBQ (88)(REG_P2), R9
  1487. SBBQ (96)(REG_P2), R10
  1488. SBBQ (104)(REG_P2), R11
  1489. SBBQ (112)(REG_P2), R12
  1490. SBBQ (120)(REG_P2), R13
  1491. MOVQ R8, (80)(REG_P3)
  1492. MOVQ R9, (88)(REG_P3)
  1493. MOVQ R10, (96)(REG_P3)
  1494. MOVQ R11, (104)(REG_P3)
  1495. MOVQ R12, (112)(REG_P3)
  1496. MOVQ R13, (120)(REG_P3)
  1497. // Now the carry flag is 1 if x-y < 0. If so, add p*2^512.
  1498. SBBQ $0, CX
  1499. // Load p into registers:
  1500. MOVQ P503_0, R8
  1501. // P503_{1,2} = P503_0, so reuse R8
  1502. MOVQ P503_3, R9
  1503. MOVQ P503_4, R10
  1504. MOVQ P503_5, R11
  1505. MOVQ P503_6, R12
  1506. MOVQ P503_7, R13
  1507. ANDQ CX, R8
  1508. ANDQ CX, R9
  1509. ANDQ CX, R10
  1510. ANDQ CX, R11
  1511. ANDQ CX, R12
  1512. ANDQ CX, R13
  1513. MOVQ (64 )(REG_P3), AX; ADDQ R8, AX; MOVQ AX, (64 )(REG_P3)
  1514. MOVQ (64+ 8)(REG_P3), AX; ADCQ R8, AX; MOVQ AX, (64+ 8)(REG_P3)
  1515. MOVQ (64+16)(REG_P3), AX; ADCQ R8, AX; MOVQ AX, (64+16)(REG_P3)
  1516. MOVQ (64+24)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (64+24)(REG_P3)
  1517. MOVQ (64+32)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (64+32)(REG_P3)
  1518. MOVQ (64+40)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (64+40)(REG_P3)
  1519. MOVQ (64+48)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (64+48)(REG_P3)
  1520. MOVQ (64+56)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (64+56)(REG_P3)
  1521. RET