You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

4 vuotta sitten

  1. //*******************************************************************************************
  2. // Supersingular Isogeny Key Encapsulation Library
  3. //
  4. // Abstract: field arithmetic in x64 assembly for P751 on Linux
  5. //*******************************************************************************************
  6. .intel_syntax noprefix // GAS Intel operand order (destination first), registers written without a prefix
  7. // Registers that are used for parameter passing:
  // rdi, rsi, rdx are the first three integer argument registers of the System V AMD64 ABI.
  // In fpadd751_asm, fpsub751_asm and mul751_asm: reg_p1 = a, reg_p2 = b, reg_p3 = c (output).
  8. #define reg_p1 rdi
  9. #define reg_p2 rsi
  10. #define reg_p3 rdx
  11. // p751 + 1
  // 64-bit limbs, index 0 = least significant limb. Only limbs 5..11 are defined here.
  // NOTE(review): limbs 0..4 are presumably zero (that is consistent with the
  // p751 x 2 constants below) - confirm against the definition of the prime.
  12. #define p751p1_5 0xEEB0000000000000
  13. #define p751p1_6 0xE3EC968549F878A8
  14. #define p751p1_7 0xDA959B1A13F7CC76
  15. #define p751p1_8 0x084E9867D6EBE876
  16. #define p751p1_9 0x8562B5045CB25748
  17. #define p751p1_10 0x0E12909F97BADC66
  18. #define p751p1_11 0x00006FE5D541F71C
  19. // p751 x 2
  // 64-bit limbs, index 0 = least significant limb (limb i is applied at byte offset 8*i).
  // Limbs 2..4 have no define of their own: fpadd751_asm and fpsub751_asm reuse
  // p751x2_1 for limbs 1..4.
  20. #define p751x2_0 0xFFFFFFFFFFFFFFFE
  21. #define p751x2_1 0xFFFFFFFFFFFFFFFF
  22. #define p751x2_5 0xDD5FFFFFFFFFFFFF
  23. #define p751x2_6 0xC7D92D0A93F0F151
  24. #define p751x2_7 0xB52B363427EF98ED
  25. #define p751x2_8 0x109D30CFADD7D0ED
  26. #define p751x2_9 0x0AC56A08B964AE90
  27. #define p751x2_10 0x1C25213F2F75B8CD
  28. #define p751x2_11 0x0000DFCBAA83EE38
  29. .text // everything below is assembled into the code section
  30. //***********************************************************************
  31. // Field addition
  32. // Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
  // Operands: 12 x 64-bit limbs each (96 bytes), least significant limb first.
  // Method: t = a + b; t = t - 2*p751; if that subtraction borrowed, 2*p751 is
  // added back through an all-ones/all-zeros mask, so there is no branch.
  // The carry out of the top limb of a + b is not used.
  // Clobbers: rax, rcx, rsi, r8-r11, flags. r12-r15 are saved and restored.
  33. //***********************************************************************
  34. .global fpadd751_asm
  35. fpadd751_asm:
  36. push r12 // r12-r15 are callee-saved and are used below as limb registers
  37. push r13
  38. push r14
  39. push r15
  40. mov r8, [reg_p1] // r8-r15, rcx <- limbs 0..8 of a
  41. mov r9, [reg_p1+8]
  42. mov r10, [reg_p1+16]
  43. mov r11, [reg_p1+24]
  44. mov r12, [reg_p1+32]
  45. mov r13, [reg_p1+40]
  46. mov r14, [reg_p1+48]
  47. mov r15, [reg_p1+56]
  48. mov rcx, [reg_p1+64]
  49. add r8, [reg_p2] // limbs 0..8 of a + b; the carry is chained through CF
  50. adc r9, [reg_p2+8]
  51. adc r10, [reg_p2+16]
  52. adc r11, [reg_p2+24]
  53. adc r12, [reg_p2+32]
  54. adc r13, [reg_p2+40]
  55. adc r14, [reg_p2+48]
  56. adc r15, [reg_p2+56]
  57. adc rcx, [reg_p2+64]
  58. mov rax, [reg_p1+72] // limbs 9..11 go through rax straight into c (mov leaves CF intact)
  59. adc rax, [reg_p2+72]
  60. mov [reg_p3+72], rax
  61. mov rax, [reg_p1+80]
  62. adc rax, [reg_p2+80]
  63. mov [reg_p3+80], rax
  64. mov rax, [reg_p1+88]
  65. adc rax, [reg_p2+88]
  66. mov [reg_p3+88], rax
  67. movq rax, p751x2_0 // each 64-bit constant is loaded into rax first; mov does not change CF
  68. sub r8, rax // start of t - 2*p751; the borrow is chained through CF
  69. movq rax, p751x2_1
  70. sbb r9, rax // limbs 1..4 all subtract p751x2_1
  71. sbb r10, rax
  72. sbb r11, rax
  73. sbb r12, rax
  74. movq rax, p751x2_5
  75. sbb r13, rax
  76. movq rax, p751x2_6
  77. sbb r14, rax
  78. movq rax, p751x2_7
  79. sbb r15, rax
  80. movq rax, p751x2_8
  81. sbb rcx, rax
  82. mov [reg_p3], r8 // store limbs 0..8 of t - 2*p751; frees r8-r10 for limbs 9..11
  83. mov [reg_p3+8], r9
  84. mov [reg_p3+16], r10
  85. mov [reg_p3+24], r11
  86. mov [reg_p3+32], r12
  87. mov [reg_p3+40], r13
  88. mov [reg_p3+48], r14
  89. mov [reg_p3+56], r15
  90. mov [reg_p3+64], rcx
  91. mov r8, [reg_p3+72] // reload limbs 9..11 of a + b that were parked in c above
  92. mov r9, [reg_p3+80]
  93. mov r10, [reg_p3+88]
  94. movq rax, p751x2_9
  95. sbb r8, rax // borrow chain continues from limb 8
  96. movq rax, p751x2_10
  97. sbb r9, rax
  98. movq rax, p751x2_11
  99. sbb r10, rax
  100. mov [reg_p3+72], r8
  101. mov [reg_p3+80], r9
  102. mov [reg_p3+88], r10
  103. movq rax, 0 // mov rather than xor: xor would clear the borrow still held in CF
  104. sbb rax, 0 // rax = mask: all ones if t - 2*p751 borrowed, otherwise 0
  105. mov rsi, p751x2_0 // reg_p2 (rsi) is no longer read; rsi, r8-r15 <- limbs of 2*p751 AND mask
  106. and rsi, rax
  107. mov r8, p751x2_1 // r8 serves limbs 1..4
  108. and r8, rax
  109. movq r9, p751x2_5
  110. and r9, rax
  111. movq r10, p751x2_6
  112. and r10, rax
  113. movq r11, p751x2_7
  114. and r11, rax
  115. movq r12, p751x2_8
  116. and r12, rax
  117. movq r13, p751x2_9
  118. and r13, rax
  119. movq r14, p751x2_10
  120. and r14, rax
  121. movq r15, p751x2_11
  122. and r15, rax
  123. mov rax, [reg_p3] // c <- c + (2*p751 AND mask), one limb at a time through rax
  124. add rax, rsi
  125. mov [reg_p3], rax
  126. mov rax, [reg_p3+8]
  127. adc rax, r8
  128. mov [reg_p3+8], rax
  129. mov rax, [reg_p3+16]
  130. adc rax, r8
  131. mov [reg_p3+16], rax
  132. mov rax, [reg_p3+24]
  133. adc rax, r8
  134. mov [reg_p3+24], rax
  135. mov rax, [reg_p3+32]
  136. adc rax, r8
  137. mov [reg_p3+32], rax
  138. mov rax, [reg_p3+40]
  139. adc rax, r9
  140. mov [reg_p3+40], rax
  141. mov rax, [reg_p3+48]
  142. adc rax, r10
  143. mov [reg_p3+48], rax
  144. mov rax, [reg_p3+56]
  145. adc rax, r11
  146. mov [reg_p3+56], rax
  147. mov rax, [reg_p3+64]
  148. adc rax, r12
  149. mov [reg_p3+64], rax
  150. mov rax, [reg_p3+72]
  151. adc rax, r13
  152. mov [reg_p3+72], rax
  153. mov rax, [reg_p3+80]
  154. adc rax, r14
  155. mov [reg_p3+80], rax
  156. mov rax, [reg_p3+88]
  157. adc rax, r15
  158. mov [reg_p3+88], rax
  159. pop r15 // restore callee-saved registers in reverse push order
  160. pop r14
  161. pop r13
  162. pop r12
  163. ret
  164. //***********************************************************************
  165. // Field subtraction
  166. // Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
  // Operands: 12 x 64-bit limbs each (96 bytes), least significant limb first.
  // Method: t = a - b; if the subtraction borrowed out of the top limb, 2*p751
  // is added to t through an all-ones/all-zeros mask, so there is no branch.
  // Clobbers: rax, rcx, rsi, r8-r11, flags. r12-r15 are saved and restored.
  167. //***********************************************************************
  168. .global fpsub751_asm
  169. fpsub751_asm:
  170. push r12 // r12-r15 are callee-saved and are used below as limb registers
  171. push r13
  172. push r14
  173. push r15
  174. mov r8, [reg_p1] // r8-r15, rcx <- limbs 0..8 of a
  175. mov r9, [reg_p1+8]
  176. mov r10, [reg_p1+16]
  177. mov r11, [reg_p1+24]
  178. mov r12, [reg_p1+32]
  179. mov r13, [reg_p1+40]
  180. mov r14, [reg_p1+48]
  181. mov r15, [reg_p1+56]
  182. mov rcx, [reg_p1+64]
  183. sub r8, [reg_p2] // limbs 0..8 of a - b; the borrow is chained through CF
  184. sbb r9, [reg_p2+8]
  185. sbb r10, [reg_p2+16]
  186. sbb r11, [reg_p2+24]
  187. sbb r12, [reg_p2+32]
  188. sbb r13, [reg_p2+40]
  189. sbb r14, [reg_p2+48]
  190. sbb r15, [reg_p2+56]
  191. sbb rcx, [reg_p2+64]
  192. mov [reg_p3], r8 // store limbs 0..8 (mov leaves the pending borrow in CF intact)
  193. mov [reg_p3+8], r9
  194. mov [reg_p3+16], r10
  195. mov [reg_p3+24], r11
  196. mov [reg_p3+32], r12
  197. mov [reg_p3+40], r13
  198. mov [reg_p3+48], r14
  199. mov [reg_p3+56], r15
  200. mov [reg_p3+64], rcx
  201. mov rax, [reg_p1+72] // limbs 9..11 go through rax straight into c, continuing the borrow chain
  202. sbb rax, [reg_p2+72]
  203. mov [reg_p3+72], rax
  204. mov rax, [reg_p1+80]
  205. sbb rax, [reg_p2+80]
  206. mov [reg_p3+80], rax
  207. mov rax, [reg_p1+88]
  208. sbb rax, [reg_p2+88]
  209. mov [reg_p3+88], rax
  210. movq rax, 0 // mov rather than xor: xor would clear the borrow still held in CF
  211. sbb rax, 0 // rax = mask: all ones if a - b borrowed, otherwise 0
  212. mov rsi, p751x2_0 // reg_p2 (rsi) is no longer read; rsi, r8-r15 <- limbs of 2*p751 AND mask
  213. and rsi, rax
  214. mov r8, p751x2_1 // r8 serves limbs 1..4
  215. and r8, rax
  216. movq r9, p751x2_5
  217. and r9, rax
  218. movq r10, p751x2_6
  219. and r10, rax
  220. movq r11, p751x2_7
  221. and r11, rax
  222. movq r12, p751x2_8
  223. and r12, rax
  224. movq r13, p751x2_9
  225. and r13, rax
  226. movq r14, p751x2_10
  227. and r14, rax
  228. movq r15, p751x2_11
  229. and r15, rax
  230. mov rax, [reg_p3] // c <- c + (2*p751 AND mask), one limb at a time through rax
  231. add rax, rsi
  232. mov [reg_p3], rax
  233. mov rax, [reg_p3+8]
  234. adc rax, r8
  235. mov [reg_p3+8], rax
  236. mov rax, [reg_p3+16]
  237. adc rax, r8
  238. mov [reg_p3+16], rax
  239. mov rax, [reg_p3+24]
  240. adc rax, r8
  241. mov [reg_p3+24], rax
  242. mov rax, [reg_p3+32]
  243. adc rax, r8
  244. mov [reg_p3+32], rax
  245. mov rax, [reg_p3+40]
  246. adc rax, r9
  247. mov [reg_p3+40], rax
  248. mov rax, [reg_p3+48]
  249. adc rax, r10
  250. mov [reg_p3+48], rax
  251. mov rax, [reg_p3+56]
  252. adc rax, r11
  253. mov [reg_p3+56], rax
  254. mov rax, [reg_p3+64]
  255. adc rax, r12
  256. mov [reg_p3+64], rax
  257. mov rax, [reg_p3+72]
  258. adc rax, r13
  259. mov [reg_p3+72], rax
  260. mov rax, [reg_p3+80]
  261. adc rax, r14
  262. mov [reg_p3+80], rax
  263. mov rax, [reg_p3+88]
  264. adc rax, r15
  265. mov [reg_p3+88], rax
  266. pop r15 // restore callee-saved registers in reverse push order
  267. pop r14
  268. pop r13
  269. pop r12
  270. ret
  271. #ifdef _MULX_
  272. ///////////////////////////////////////////////////////////////// MACRO
  273. // Schoolbook integer multiplication
  274. // Inputs: memory pointers M0 and M1 (six 64-bit limbs each, read at offsets 0..40)
  275. // Outputs: memory pointer C (twelve 64-bit limbs, written at offsets 0..88)
  276. // Temps: memory operand S (the macro bodies touch only the two 64-bit slots S and S+8), regs T0:T7
  // Also clobbers rax, rdx and the flags.
  // Memory arguments are passed in bracketed form (for example [rsp+48]) so that a
  // numeric prefix such as 8 placed in front of the argument acts as a displacement.
  // NOTE(review): an earlier version of this header asked for 15 64-bit stack
  // values; the bodies below use only two - confirm no caller relies on more.
  277. /////////////////////////////////////////////////////////////////
  // Variant for CPUs with ADX: runs two independent carry chains,
  // adcx on CF and adox on OF (mulx and mov change neither flag).
  278. #if _ADX_
  279. .macro MUL384_SCHOOL M0, M1, C, S, T0, T1, T2, T3, T4, T5, T6, T7
  280. mov rdx, \M0 // row 0: rdx = limb 0 of M0 (implicit multiplicand of mulx)
  281. mulx \T0, \T1, \M1 // mulx hi, lo, src: T0 = high word, T1 = low word
  282. mulx \T2, \T3, 8\M1
  283. mov \C, \T1 // C0_final
  284. xor rax, rax // rax = 0 and CF = OF = 0: both carry chains start clean
  285. mulx \T4, \T5, 16\M1
  286. adox \T0, \T3 // OF chain: add each low word to the previous high word
  287. adox \T2, \T5
  288. mulx \T1, \T3, 24\M1
  289. adox \T4, \T3
  290. mulx \T5, \T6, 32\M1
  291. adox \T1, \T6
  292. mulx \T3, \T7, 40\M1
  293. adox \T5, \T7
  294. adox \T3, rax // rax = 0: fold the last OF carry into the top word
  295. mov rdx, 8\M0 // row 1: limb 1 of M0
  296. mulx \T6, \T7, \M1
  297. adcx \T0, \T7 // CF chain (CF is still 0 from the xor above)
  298. mov 8\C, \T0 // C1_final
  299. adcx \T2, \T6
  300. mulx \T6, \T7, 8\M1
  301. mov \S, \T7 // store T7 (low word, added back by the OF chain below)
  302. adcx \T4, \T6
  303. mulx \T0, \T6, 16\M1
  304. mov 8\S, \T6 // store T6 (low word, added back by the OF chain below)
  305. adcx \T0, \T1 // T0 (just freed by the C1 store) takes over the running column from T1
  306. mulx \T1, \T7, 24\M1
  307. adcx \T1, \T5
  308. mulx \T5, \T6, 32\M1
  309. adcx \T3, \T5
  310. mulx \T5, rdx, 40\M1 // last product of the row: its low word overwrites rdx
  311. adcx \T5, rax // rax = 0: fold the last CF carry into the new top word
  312. xor rax, rax // clear CF and OF again: OF chain below, CF chain of the next row
  313. adox \T2, \S // add the pending low words of this row
  314. adox \T4, 8\S
  315. adox \T0, \T7
  316. adox \T1, \T6
  317. adox \T3, rdx
  318. adox \T5, rax
  319. mov rdx, 16\M0 // row 2: limb 2 of M0
  320. mulx \T6, \T7, \M1
  321. adcx \T2, \T7
  322. mov 16\C, \T2 // C2_final
  323. adcx \T4, \T6
  324. mulx \T6, \T7, 8\M1
  325. mov \S, \T7 // store T7
  326. adcx \T0, \T6
  327. mulx \T2, \T6, 16\M1
  328. mov 8\S, \T6 // store T6
  329. adcx \T1, \T2
  330. mulx \T2, \T7, 24\M1
  331. adcx \T3, \T2
  332. mulx \T2, \T6, 32\M1
  333. adcx \T5, \T2
  334. mulx \T2, rdx, 40\M1
  335. adcx \T2, rax // T2 becomes the new top word
  336. xor rax, rax
  337. adox \T4, \S
  338. adox \T0, 8\S
  339. adox \T1, \T7
  340. adox \T3, \T6
  341. adox \T5, rdx
  342. adox \T2, rax
  343. mov rdx, 24\M0 // row 3: limb 3 of M0
  344. mulx \T6, \T7, \M1
  345. adcx \T4, \T7
  346. mov 24\C, \T4 // C3_final
  347. adcx \T0, \T6
  348. mulx \T6, \T7, 8\M1
  349. mov \S, \T7 // store T7
  350. adcx \T1, \T6
  351. mulx \T4, \T6, 16\M1
  352. mov 8\S, \T6 // store T6
  353. adcx \T3, \T4
  354. mulx \T4, \T7, 24\M1
  355. adcx \T5, \T4
  356. mulx \T4, \T6, 32\M1
  357. adcx \T2, \T4
  358. mulx \T4, rdx, 40\M1
  359. adcx \T4, rax // T4 becomes the new top word
  360. xor rax, rax
  361. adox \T0, \S
  362. adox \T1, 8\S
  363. adox \T3, \T7
  364. adox \T5, \T6
  365. adox \T2, rdx
  366. adox \T4, rax
  367. mov rdx, 32\M0 // row 4: limb 4 of M0
  368. mulx \T6, \T7, \M1
  369. adcx \T0, \T7
  370. mov 32\C, \T0 // C4_final
  371. adcx \T1, \T6
  372. mulx \T6, \T7, 8\M1
  373. mov \S, \T7 // store T7
  374. adcx \T3, \T6
  375. mulx \T0, \T6, 16\M1
  376. mov 8\S, \T6 // store T6
  377. adcx \T5, \T0
  378. mulx \T0, \T7, 24\M1
  379. adcx \T2, \T0
  380. mulx \T0, \T6, 32\M1
  381. adcx \T4, \T0
  382. mulx \T0, rdx, 40\M1
  383. adcx \T0, rax // T0 becomes the new top word
  384. xor rax, rax
  385. adox \T1, \S
  386. adox \T3, 8\S
  387. adox \T5, \T7
  388. adox \T2, \T6
  389. adox \T4, rdx
  390. adox \T0, rax
  391. mov rdx, 40\M0 // row 5: limb 5 of M0 (last row)
  392. mulx \T6, \T7, \M1
  393. adcx \T1, \T7
  394. mov 40\C, \T1 // C5_final
  395. adcx \T3, \T6
  396. mulx \T6, \T7, 8\M1
  397. mov \S, \T7 // store T7
  398. adcx \T5, \T6
  399. mulx \T1, \T6, 16\M1
  400. mov 8\S, \T6 // store T6
  401. adcx \T2, \T1
  402. mulx \T1, \T7, 24\M1
  403. adcx \T4, \T1
  404. mulx \T1, \T6, 32\M1
  405. adcx \T0, \T1
  406. mulx \T1, rdx, 40\M1
  407. adcx \T1, rax // T1 becomes the top word; the CF chain ends here
  408. add \T3, \S // no further row follows, so a plain add/adc chain adds the pending low words
  409. adc \T5, 8\S
  410. adc \T2, \T7
  411. adc \T4, \T6
  412. adc \T0, rdx
  413. adc \T1, 0
  414. mov 48\C, \T3 // store the upper six result limbs C6..C11
  415. mov 56\C, \T5
  416. mov 64\C, \T2
  417. mov 72\C, \T4
  418. mov 80\C, \T0
  419. mov 88\C, \T1
  420. .endm
  421. #else
  // Variant without ADX: schoolbook 6 x 6 limb product using mulx and a single
  // add/adc carry chain on CF (mulx and mov do not change the flags).
  // Reads six limbs at M0 and at M1 (offsets 0..40), writes twelve limbs at C
  // (offsets 0..88). Uses the two 64-bit slots S and S+8 as scratch and
  // clobbers T0-T7, rax, rdx and the flags.
  422. .macro MUL384_SCHOOL M0, M1, C, S, T0, T1, T2, T3, T4, T5, T6, T7
  423. mov rdx, \M0 // row 0: rdx = limb 0 of M0 (implicit multiplicand of mulx)
  424. mulx \T0, \T1, \M1 // mulx hi, lo, src: T0 = high word, T1 = low word
  425. mulx \T2, \T3, 8\M1
  426. mov \C, \T1 // C0_final
  427. mulx \T4, \T5, 16\M1
  428. add \T0, \T3 // add each low word to the previous high word
  429. adc \T2, \T5
  430. mulx \T1, \T3, 24\M1
  431. adc \T4, \T3
  432. mulx \T5, \T6, 32\M1
  433. adc \T1, \T6
  434. mulx \T3, \T7, 40\M1
  435. adc \T5, \T7
  436. adc \T3, 0 // fold the last carry into the top word
  437. mov rdx, 8\M0 // row 1: limb 1 of M0
  438. mulx \T6, \T7, \M1
  439. add \T0, \T7 // first pass of the row: first low word, then the high words
  440. mov 8\C, \T0 // C1_final
  441. adc \T2, \T6
  442. mulx \T6, \T7, 8\M1
  443. mov \S, \T7 // store T7 (low word, added in the second pass below)
  444. adc \T4, \T6
  445. mulx \T0, \T6, 16\M1
  446. mov 8\S, \T6 // store T6 (low word, added in the second pass below)
  447. adc \T0, \T1 // T0 (just freed by the C1 store) takes over the running column from T1
  448. mulx \T1, rax, 24\M1 // rax holds this low word until the second pass
  449. adc \T1, \T5
  450. mulx \T5, \T7, 32\M1
  451. adc \T3, \T5
  452. mulx \T5, \T6, 40\M1
  453. adc \T5, 0 // T5 becomes the new top word
  454. add \T2, \S // second pass of the row: add the pending low words
  455. adc \T4, 8\S
  456. adc \T0, rax
  457. adc \T1, \T7
  458. adc \T3, \T6
  459. adc \T5, 0
  460. mov rdx, 16\M0 // row 2: limb 2 of M0
  461. mulx \T6, \T7, \M1
  462. add \T2, \T7
  463. mov 16\C, \T2 // C2_final
  464. adc \T4, \T6
  465. mulx \T6, \T7, 8\M1
  466. mov \S, \T7 // store T7
  467. adc \T0, \T6
  468. mulx \T2, \T6, 16\M1
  469. mov 8\S, \T6 // store T6
  470. adc \T1, \T2
  471. mulx \T2, rax, 24\M1
  472. adc \T3, \T2
  473. mulx \T2, \T7, 32\M1
  474. adc \T5, \T2
  475. mulx \T2, \T6, 40\M1
  476. adc \T2, 0 // T2 becomes the new top word
  477. add \T4, \S
  478. adc \T0, 8\S
  479. adc \T1, rax
  480. adc \T3, \T7
  481. adc \T5, \T6
  482. adc \T2, 0
  483. mov rdx, 24\M0 // row 3: limb 3 of M0
  484. mulx \T6, \T7, \M1
  485. add \T4, \T7
  486. mov 24\C, \T4 // C3_final
  487. adc \T0, \T6
  488. mulx \T6, \T7, 8\M1
  489. mov \S, \T7 // store T7
  490. adc \T1, \T6
  491. mulx \T4, \T6, 16\M1
  492. mov 8\S, \T6 // store T6
  493. adc \T3, \T4
  494. mulx \T4, rax, 24\M1
  495. adc \T5, \T4
  496. mulx \T4, \T7, 32\M1
  497. adc \T2, \T4
  498. mulx \T4, \T6, 40\M1
  499. adc \T4, 0 // T4 becomes the new top word
  500. add \T0, \S
  501. adc \T1, 8\S
  502. adc \T3, rax
  503. adc \T5, \T7
  504. adc \T2, \T6
  505. adc \T4, 0
  506. mov rdx, 32\M0 // row 4: limb 4 of M0
  507. mulx \T6, \T7, \M1
  508. add \T0, \T7
  509. mov 32\C, \T0 // C4_final
  510. adc \T1, \T6
  511. mulx \T6, \T7, 8\M1
  512. mov \S, \T7 // store T7
  513. adc \T3, \T6
  514. mulx \T0, \T6, 16\M1
  515. mov 8\S, \T6 // store T6
  516. adc \T5, \T0
  517. mulx \T0, rax, 24\M1
  518. adc \T2, \T0
  519. mulx \T0, \T7, 32\M1
  520. adc \T4, \T0
  521. mulx \T0, \T6, 40\M1
  522. adc \T0, 0 // T0 becomes the new top word
  523. add \T1, \S
  524. adc \T3, 8\S
  525. adc \T5, rax
  526. adc \T2, \T7
  527. adc \T4, \T6
  528. adc \T0, 0
  529. mov rdx, 40\M0 // row 5: limb 5 of M0 (last row)
  530. mulx \T6, \T7, \M1
  531. add \T1, \T7
  532. mov 40\C, \T1 // C5_final
  533. adc \T3, \T6
  534. mulx \T6, \T7, 8\M1
  535. mov \S, \T7 // store T7
  536. adc \T5, \T6
  537. mulx \T1, \T6, 16\M1
  538. mov 8\S, \T6 // store T6
  539. adc \T2, \T1
  540. mulx \T1, rax, 24\M1
  541. adc \T4, \T1
  542. mulx \T1, \T7, 32\M1
  543. adc \T0, \T1
  544. mulx \T1, \T6, 40\M1
  545. adc \T1, 0 // T1 becomes the top word
  546. add \T3, \S // final pass: each limb is stored as soon as it is complete (mov keeps CF)
  547. mov 48\C, \T3
  548. adc \T5, 8\S
  549. mov 56\C, \T5
  550. adc \T2, rax
  551. mov 64\C, \T2
  552. adc \T4, \T7
  553. mov 72\C, \T4
  554. adc \T0, \T6
  555. mov 80\C, \T0
  556. adc \T1, 0
  557. mov 88\C, \T1
  558. .endm
  559. #endif
  560. //*****************************************************************************
  561. // 751-bit multiplication using Karatsuba (one level), schoolbook (two levels)
  // Operation: c [reg_p3] = a [reg_p1] * b [reg_p2]
  // a, b: 12 x 64-bit limbs each; c: 24 x 64-bit limbs, written at [rcx] .. [rcx+184].
  // With AL/AH and BL/BH the low/high six limbs of a and b, three MUL384_SCHOOL
  // products are combined: AL x BL, (AH+AL) x (BH+BL) and AH x BH.
  // c must not overlap a or b: [rcx] is written before the inputs are fully read.
  // Clobbers: rax, rcx, rdx, rdi, r8-r11, flags. rbx, rbp, r12-r15 are saved and restored.
  562. //*****************************************************************************
  563. .global mul751_asm
  564. mul751_asm:
  565. push r12 // callee-saved registers used below
  566. push r13
  567. push r14
  568. push r15
  569. mov rcx, reg_p3 // keep the output pointer in rcx: rdx (reg_p3) is overwritten below and by mulx
  570. // [rsp] <- AH + AL, rax <- mask (all ones if AH + AL carried out of six limbs, else 0)
  571. xor rax, rax
  572. mov r8, [reg_p1]
  573. mov r9, [reg_p1+8]
  574. mov r10, [reg_p1+16]
  575. mov r11, [reg_p1+24]
  576. mov r12, [reg_p1+32]
  577. mov r13, [reg_p1+40]
  578. push rbx
  579. push rbp
  // Stack frame: [rsp..rsp+47] AH+AL (later AH x BH low half), [rsp+48..rsp+95] BH+BL
  // (later AH x BH high half), [rsp+96..rsp+127] limbs 0..3 of the masked sum,
  // [rsp+128..rsp+143] scratch S for MUL384_SCHOOL. The last 8 bytes are not accessed
  // (presumably alignment padding - confirm).
  580. sub rsp, 152
  581. add r8, [reg_p1+48]
  582. adc r9, [reg_p1+56]
  583. adc r10, [reg_p1+64]
  584. adc r11, [reg_p1+72]
  585. adc r12, [reg_p1+80]
  586. adc r13, [reg_p1+88]
  587. sbb rax, 0 // rax = 0 - CF
  588. mov [rsp], r8
  589. mov [rsp+8], r9
  590. mov [rsp+16], r10
  591. mov [rsp+24], r11
  592. mov [rsp+32], r12
  593. mov [rsp+40], r13
  594. // [rsp+48] <- BH + BL, rdx <- mask (all ones if BH + BL carried out of six limbs, else 0)
  595. xor rdx, rdx
  596. mov r8, [reg_p2]
  597. mov r9, [reg_p2+8]
  598. mov rbx, [reg_p2+16]
  599. mov rbp, [reg_p2+24]
  600. mov r14, [reg_p2+32]
  601. mov r15, [reg_p2+40]
  602. add r8, [reg_p2+48]
  603. adc r9, [reg_p2+56]
  604. adc rbx, [reg_p2+64]
  605. adc rbp, [reg_p2+72]
  606. adc r14, [reg_p2+80]
  607. adc r15, [reg_p2+88]
  608. sbb rdx, 0 // rdx = 0 - CF
  609. mov [rsp+48], r8
  610. mov [rsp+56], r9
  611. mov [rsp+64], rbx
  612. mov [rsp+72], rbp
  613. mov [rsp+80], r14
  614. mov [rsp+88], r15
  615. // [rcx] <- masked (BH + BL): masked with rax, the carry mask of AH + AL.
  // Limbs 0..3 are stored; limbs 2..5 also stay in rbx, rbp, r14, r15.
  616. and r8, rax
  617. and r9, rax
  618. and rbx, rax
  619. and rbp, rax
  620. and r14, rax
  621. and r15, rax
  622. mov [rcx], r8 // parked in c so that r8, r9 can be reused just below
  623. mov [rcx+8], r9
  624. mov [rcx+16], rbx // not read back: rbx is used directly in the adc chain below
  625. mov [rcx+24], rbp // not read back: rbp is used directly in the adc chain below
  626. // r8-r13 <- masked (AH + AL): masked with rdx, the carry mask of BH + BL
  627. mov r8, [rsp] // r10-r13 still hold limbs 2..5 of AH + AL
  628. mov r9, [rsp+8]
  629. and r8, rdx
  630. and r9, rdx
  631. and r10, rdx
  632. and r11, rdx
  633. and r12, rdx
  634. and r13, rdx
  635. // [rsp+96] <- masked (AH + AL) + masked (BH + BL); limbs 4, 5 stay in r12, r13
  636. mov rax, [rcx]
  637. mov rdx, [rcx+8]
  638. add r8, rax
  639. adc r9, rdx
  640. adc r10, rbx
  641. adc r11, rbp
  642. adc r12, r14
  643. adc r13, r15
  644. mov [rsp+96], r8
  645. mov [rsp+104], r9
  646. mov [rsp+112], r10
  647. mov [rsp+120], r11
  648. // [rcx] <- AL x BL (r12, r13, rdi, rsi, rcx are not touched by the macro calls)
  649. MUL384_SCHOOL [reg_p1], [reg_p2], [rcx], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15 // Result C0-C5
  650. // [rcx+96] <- (AH+AL) x (BH+BL), low part
  651. MUL384_SCHOOL [rsp], [rsp+48], [rcx+96], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15
  652. // [rsp] <- AH x BH (overwrites the two sums, which are no longer needed)
  653. MUL384_SCHOOL [reg_p1+48], [reg_p2+48], [rsp], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15
  654. // r8-r13 <- (AH+AL) x (BH+BL), final step: masked sum + upper six limbs of the product
  655. mov r8, [rsp+96]
  656. mov r9, [rsp+104]
  657. mov r10, [rsp+112]
  658. mov r11, [rsp+120]
  659. mov rax, [rcx+144]
  660. add r8, rax
  661. mov rax, [rcx+152]
  662. adc r9, rax
  663. mov rax, [rcx+160]
  664. adc r10, rax
  665. mov rax, [rcx+168]
  666. adc r11, rax
  667. mov rax, [rcx+176]
  668. adc r12, rax
  669. mov rax, [rcx+184]
  670. adc r13, rax
  671. // rdi,rdx,rbx,rbp,r14,r15,r8-r13 <- (AH+AL) x (BH+BL) - ALxBL (reg_p1 in rdi is no longer needed)
  672. mov rdi, [rcx+96]
  673. sub rdi, [rcx]
  674. mov rdx, [rcx+104]
  675. sbb rdx, [rcx+8]
  676. mov rbx, [rcx+112]
  677. sbb rbx, [rcx+16]
  678. mov rbp, [rcx+120]
  679. sbb rbp, [rcx+24]
  680. mov r14, [rcx+128]
  681. sbb r14, [rcx+32]
  682. mov r15, [rcx+136]
  683. sbb r15, [rcx+40]
  684. sbb r8, [rcx+48]
  685. sbb r9, [rcx+56]
  686. sbb r10, [rcx+64]
  687. sbb r11, [rcx+72]
  688. sbb r12, [rcx+80]
  689. sbb r13, [rcx+88]
  690. // rdi,rdx,rbx,rbp,r14,r15,r8-r13 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
  691. sub rdi, [rsp]
  692. sbb rdx, [rsp+8]
  693. sbb rbx, [rsp+16]
  694. sbb rbp, [rsp+24]
  695. sbb r14, [rsp+32]
  696. sbb r15, [rsp+40]
  697. sbb r8, [rsp+48]
  698. sbb r9, [rsp+56]
  699. sbb r10, [rsp+64]
  700. sbb r11, [rsp+72]
  701. sbb r12, [rsp+80]
  702. sbb r13, [rsp+88]
  // Add the 12-limb middle term into c starting at limb 6; a single carry chain
  // runs from here through the adc r13, 0 below (mov does not change CF).
  703. mov rax, [rcx+48]
  704. add rax, rdi
  705. mov [rcx+48], rax // Result C6-C11
  706. mov rax, [rcx+56]
  707. adc rax, rdx
  708. mov [rcx+56], rax
  709. mov rax, [rcx+64]
  710. adc rax, rbx
  711. mov [rcx+64], rax
  712. mov rax, [rcx+72]
  713. adc rax, rbp
  714. mov [rcx+72], rax
  715. mov rax, [rcx+80]
  716. adc rax, r14
  717. mov [rcx+80], rax
  718. mov rax, [rcx+88]
  719. adc rax, r15
  720. mov [rcx+88], rax
  721. mov rax, [rsp] // low half of AH x BH
  722. adc r8, rax
  723. mov [rcx+96], r8 // Result C12-C17
  724. mov rax, [rsp+8]
  725. adc r9, rax
  726. mov [rcx+104], r9
  727. mov rax, [rsp+16]
  728. adc r10, rax
  729. mov [rcx+112], r10
  730. mov rax, [rsp+24]
  731. adc r11, rax
  732. mov [rcx+120], r11
  733. mov rax, [rsp+32]
  734. adc r12, rax
  735. mov [rcx+128], r12
  736. mov rax, [rsp+40]
  737. adc r13, rax
  738. mov [rcx+136], r13
  739. mov r8, [rsp+48] // high half of AH x BH; only the pending carry is added to it
  740. mov r9, [rsp+56]
  741. mov r10, [rsp+64]
  742. mov r11, [rsp+72]
  743. mov r12, [rsp+80]
  744. mov r13, [rsp+88]
  745. adc r8, 0
  746. adc r9, 0
  747. adc r10, 0
  748. adc r11, 0
  749. adc r12, 0
  750. adc r13, 0
  751. add rsp, 152 // release the frame; this changes the flags, which are no longer needed
  752. mov [rcx+144], r8 // Result C18-C23
  753. mov [rcx+152], r9
  754. mov [rcx+160], r10
  755. mov [rcx+168], r11
  756. mov [rcx+176], r12
  757. mov [rcx+184], r13
  758. pop rbp // restore callee-saved registers in reverse push order
  759. pop rbx
  760. pop r15
  761. pop r14
  762. pop r13
  763. pop r12
  764. ret
  765. #else
  766. //***********************************************************************
  767. // Integer multiplication
  768. // Based on Karatsuba method
  769. // Operation: c [reg_p3] = a [reg_p1] * b [reg_p2]
  770. // NOTE: a=c or b=c are not allowed
  771. //***********************************************************************
  772. .global mul751_asm
  773. mul751_asm:
  774. push r12
  775. push r13
  776. push r14
  777. mov rcx, reg_p3
  778. // rcx[0-5] <- AH+AL
  779. xor rax, rax
  780. mov r8, [reg_p1+48]
  781. mov r9, [reg_p1+56]
  782. mov r10, [reg_p1+64]
  783. mov r11, [reg_p1+72]
  784. mov r12, [reg_p1+80]
  785. mov r13, [reg_p1+88]
  786. add r8, [reg_p1]
  787. adc r9, [reg_p1+8]
  788. adc r10, [reg_p1+16]
  789. adc r11, [reg_p1+24]
  790. adc r12, [reg_p1+32]
  791. adc r13, [reg_p1+40]
  792. push r15
  793. mov [rcx], r8
  794. mov [rcx+8], r9
  795. mov [rcx+16], r10
  796. mov [rcx+24], r11
  797. mov [rcx+32], r12
  798. mov [rcx+40], r13
  799. sbb rax, 0
  800. sub rsp, 96 // Allocating space in stack
  801. // rcx[6-11] <- BH+BL
  802. xor rdx, rdx
  803. mov r8, [reg_p2+48]
  804. mov r9, [reg_p2+56]
  805. mov r10, [reg_p2+64]
  806. mov r11, [reg_p2+72]
  807. mov r12, [reg_p2+80]
  808. mov r13, [reg_p2+88]
  809. add r8, [reg_p2]
  810. adc r9, [reg_p2+8]
  811. adc r10, [reg_p2+16]
  812. adc r11, [reg_p2+24]
  813. adc r12, [reg_p2+32]
  814. adc r13, [reg_p2+40]
  815. mov [rcx+48], r8
  816. mov [rcx+56], r9
  817. mov [rcx+64], r10
  818. mov [rcx+72], r11
  819. mov [rcx+80], r12
  820. mov [rcx+88], r13
  821. sbb rdx, 0
  822. mov [rsp+80], rax
  823. mov [rsp+88], rdx
  824. // (rsp[0-8],r10,r8,r9) <- (AH+AL)*(BH+BL)
  825. mov r11, [rcx]
  826. mov rax, r8
  827. mul r11
  828. mov [rsp], rax // c0
  829. mov r14, rdx
  830. xor r15, r15
  831. mov rax, r9
  832. mul r11
  833. xor r9, r9
  834. add r14, rax
  835. adc r9, rdx
  836. mov r12, [rcx+8]
  837. mov rax, r8
  838. mul r12
  839. add r14, rax
  840. mov [rsp+8], r14 // c1
  841. adc r9, rdx
  842. adc r15, 0
  843. xor r8, r8
  844. mov rax, r10
  845. mul r11
  846. add r9, rax
  847. mov r13, [rcx+48]
  848. adc r15, rdx
  849. adc r8, 0
  850. mov rax, [rcx+16]
  851. mul r13
  852. add r9, rax
  853. adc r15, rdx
  854. mov rax, [rcx+56]
  855. adc r8, 0
  856. mul r12
  857. add r9, rax
  858. mov [rsp+16], r9 // c2
  859. adc r15, rdx
  860. adc r8, 0
  861. xor r9, r9
  862. mov rax, [rcx+72]
  863. mul r11
  864. add r15, rax
  865. adc r8, rdx
  866. adc r9, 0
  867. mov rax, [rcx+24]
  868. mul r13
  869. add r15, rax
  870. adc r8, rdx
  871. adc r9, 0
  872. mov rax, r10
  873. mul r12
  874. add r15, rax
  875. adc r8, rdx
  876. adc r9, 0
  877. mov r14, [rcx+16]
  878. mov rax, [rcx+56]
  879. mul r14
  880. add r15, rax
  881. mov [rsp+24], r15 // c3
  882. adc r8, rdx
  883. adc r9, 0
  884. xor r10, r10
  885. mov rax, [rcx+80]
  886. mul r11
  887. add r8, rax
  888. adc r9, rdx
  889. adc r10, 0
  890. mov rax, [rcx+64]
  891. mul r14
  892. add r8, rax
  893. adc r9, rdx
  894. adc r10, 0
  895. mov r15, [rcx+48]
  896. mov rax, [rcx+32]
  897. mul r15
  898. add r8, rax
  899. adc r9, rdx
  900. adc r10, 0
  901. mov rax, [rcx+72]
  902. mul r12
  903. add r8, rax
  904. adc r9, rdx
  905. adc r10, 0
  906. mov r13, [rcx+24]
  907. mov rax, [rcx+56]
  908. mul r13
  909. add r8, rax
  910. mov [rsp+32], r8 // c4
  911. adc r9, rdx
  912. adc r10, 0
  913. xor r8, r8
  914. mov rax, [rcx+88]
  915. mul r11
  916. add r9, rax
  917. adc r10, rdx
  918. adc r8, 0
  919. mov rax, [rcx+64]
  920. mul r13
  921. add r9, rax
  922. adc r10, rdx
  923. adc r8, 0
  924. mov rax, [rcx+72]
  925. mul r14
  926. add r9, rax
  927. adc r10, rdx
  928. adc r8, 0
  929. mov rax, [rcx+40]
  930. mul r15
  931. add r9, rax
  932. adc r10, rdx
  933. adc r8, 0
  934. mov rax, [rcx+80]
  935. mul r12
  936. add r9, rax
  937. adc r10, rdx
  938. adc r8, 0
  939. mov r15, [rcx+32]
  940. mov rax, [rcx+56]
  941. mul r15
  942. add r9, rax
  943. mov [rsp+40], r9 // c5
  944. adc r10, rdx
  945. adc r8, 0
  946. xor r9, r9
  947. mov rax, [rcx+64]
  948. mul r15
  949. add r10, rax
  950. adc r8, rdx
  951. adc r9, 0
  952. mov rax, [rcx+88]
  953. mul r12
  954. add r10, rax
  955. adc r8, rdx
  956. adc r9, 0
  957. mov rax, [rcx+80]
  958. mul r14
  959. add r10, rax
  960. adc r8, rdx
  961. adc r9, 0
  962. mov r11, [rcx+40]
  963. mov rax, [rcx+56]
  964. mul r11
  965. add r10, rax
  966. adc r8, rdx
  967. adc r9, 0
  968. mov rax, [rcx+72]
  969. mul r13
  970. add r10, rax
  971. mov [rsp+48], r10 // c6
  972. adc r8, rdx
  973. adc r9, 0
  974. xor r10, r10
  975. mov rax, [rcx+88]
  976. mul r14
  977. add r8, rax
  978. adc r9, rdx
  979. adc r10, 0
  980. mov rax, [rcx+64]
  981. mul r11
  982. add r8, rax
  983. adc r9, rdx
  984. adc r10, 0
  985. mov rax, [rcx+80]
  986. mul r13
  987. add r8, rax
  988. adc r9, rdx
  989. adc r10, 0
  990. mov rax, [rcx+72]
  991. mul r15
  992. add r8, rax
  993. mov [rsp+56], r8 // c7
  994. adc r9, rdx
  995. adc r10, 0
  996. xor r8, r8
  997. mov rax, [rcx+72]
  998. mul r11
  999. add r9, rax
  1000. adc r10, rdx
  1001. adc r8, 0
  1002. mov rax, [rcx+80]
  1003. mul r15
  1004. add r9, rax
  1005. adc r10, rdx
  1006. adc r8, 0
  1007. mov rax, [rcx+88]
  1008. mul r13
  1009. add r9, rax
  1010. mov [rsp+64], r9 // c8
  1011. adc r10, rdx
  1012. adc r8, 0
  1013. xor r9, r9
  1014. mov rax, [rcx+88]
  1015. mul r15
  1016. add r10, rax
  1017. adc r8, rdx
  1018. adc r9, 0
  1019. mov rax, [rcx+80]
  1020. mul r11
  1021. add r10, rax // c9
  1022. adc r8, rdx
  1023. adc r9, 0
  1024. mov rax, [rcx+88]
  1025. mul r11
  1026. add r8, rax // c10
  1027. adc r9, rdx // c11
  1028. mov rax, [rsp+88]
  1029. mov rdx, [rcx]
  1030. and r12, rax
  1031. and r14, rax
  1032. and rdx, rax
  1033. and r13, rax
  1034. and r15, rax
  1035. and r11, rax
  1036. mov rax, [rsp+48]
  1037. add rdx, rax
  1038. mov rax, [rsp+56]
  1039. adc r12, rax
  1040. mov rax, [rsp+64]
  1041. adc r14, rax
  1042. adc r13, r10
  1043. adc r15, r8
  1044. adc r11, r9
  1045. mov rax, [rsp+80]
  1046. mov [rsp+48], rdx
  1047. mov [rsp+56], r12
  1048. mov [rsp+64], r14
  1049. mov [rsp+72], r13
  1050. mov [rsp+80], r15
  1051. mov [rsp+88], r11
  1052. mov r8, [rcx+48]
  1053. mov r9, [rcx+56]
  1054. mov r10, [rcx+64]
  1055. mov r11, [rcx+72]
  1056. mov r12, [rcx+80]
  1057. mov r13, [rcx+88]
  1058. and r8, rax
  1059. and r9, rax
  1060. and r10, rax
  1061. and r11, rax
  1062. and r12, rax
  1063. and r13, rax
  1064. mov rax, [rsp+48]
  1065. add r8, rax
  1066. mov rax, [rsp+56]
  1067. adc r9, rax
  1068. mov rax, [rsp+64]
  1069. adc r10, rax
  1070. mov rax, [rsp+72]
  1071. adc r11, rax
  1072. mov rax, [rsp+80]
  1073. adc r12, rax
  1074. mov rax, [rsp+88]
  1075. adc r13, rax
  1076. mov [rsp+48], r8
  1077. mov [rsp+56], r9
  1078. mov [rsp+72], r11
  1079. // rcx[0-11] <- AL*BL
  1080. mov r11, [reg_p1]
  1081. mov rax, [reg_p2]
  1082. mul r11
  1083. xor r9, r9
  1084. mov [rcx], rax // c0
  1085. mov [rsp+64], r10
  1086. mov r8, rdx
  1087. mov rax, [reg_p2+8]
  1088. mul r11
  1089. xor r10, r10
  1090. add r8, rax
  1091. mov [rsp+80], r12
  1092. adc r9, rdx
  1093. mov r12, [reg_p1+8]
  1094. mov rax, [reg_p2]
  1095. mul r12
  1096. add r8, rax
  1097. mov [rcx+8], r8 // c1
  1098. adc r9, rdx
  1099. mov [rsp+88], r13
  1100. adc r10, 0
  1101. xor r8, r8
  1102. mov rax, [reg_p2+16]
  1103. mul r11
  1104. add r9, rax
  1105. adc r10, rdx
  1106. adc r8, 0
  1107. mov r13, [reg_p2]
  1108. mov rax, [reg_p1+16]
  1109. mul r13
  1110. add r9, rax
  1111. adc r10, rdx
  1112. adc r8, 0
  1113. mov rax, [reg_p2+8]
  1114. mul r12
  1115. add r9, rax
  1116. mov [rcx+16], r9 // c2
  1117. adc r10, rdx
  1118. adc r8, 0
  1119. xor r9, r9
  1120. mov rax, [reg_p2+24]
  1121. mul r11
  1122. add r10, rax
  1123. adc r8, rdx
  1124. adc r9, 0
  1125. mov rax, [reg_p1+24]
  1126. mul r13
  1127. add r10, rax
  1128. adc r8, rdx
  1129. adc r9, 0
  1130. mov rax, [reg_p2+16]
  1131. mul r12
  1132. add r10, rax
  1133. adc r8, rdx
  1134. adc r9, 0
  1135. mov r14, [reg_p1+16]
  1136. mov rax, [reg_p2+8]
  1137. mul r14
  1138. add r10, rax
  1139. mov [rcx+24], r10 // c3
  1140. adc r8, rdx
  1141. adc r9, 0
  1142. xor r10, r10
  1143. mov rax, [reg_p2+32]
  1144. mul r11
  1145. add r8, rax
  1146. adc r9, rdx
  1147. adc r10, 0
  1148. mov rax, [reg_p2+16]
  1149. mul r14
  1150. add r8, rax
  1151. adc r9, rdx
  1152. adc r10, 0
  1153. mov rax, [reg_p1+32]
  1154. mul r13
  1155. add r8, rax
  1156. adc r9, rdx
  1157. adc r10, 0
  1158. mov rax, [reg_p2+24]
  1159. mul r12
  1160. add r8, rax
  1161. adc r9, rdx
  1162. adc r10, 0
  1163. mov r13, [reg_p1+24]
  1164. mov rax, [reg_p2+8]
  1165. mul r13
  1166. add r8, rax
  1167. mov [rcx+32], r8 // c4
  1168. adc r9, rdx
  1169. adc r10, 0
  1170. xor r8, r8
  1171. mov rax, [reg_p2+40]
  1172. mul r11
  1173. add r9, rax
  1174. adc r10, rdx
  1175. adc r8, 0
  1176. mov rax, [reg_p2+16]
  1177. mul r13
  1178. add r9, rax
  1179. adc r10, rdx
  1180. adc r8, 0
  1181. mov rax, [reg_p2+24]
  1182. mul r14
  1183. add r9, rax
  1184. adc r10, rdx
  1185. adc r8, 0
  1186. mov r11, [reg_p1+40]
  1187. mov rax, [reg_p2]
  1188. mul r11
  1189. add r9, rax
  1190. adc r10, rdx
  1191. adc r8, 0
  1192. mov rax, [reg_p2+32]
  1193. mul r12
  1194. add r9, rax
  1195. adc r10, rdx
  1196. adc r8, 0
  1197. mov r15, [reg_p1+32]
  1198. mov rax, [reg_p2+8]
  1199. mul r15
  1200. add r9, rax
  1201. mov [rcx+40], r9 // c5
  1202. adc r10, rdx
  1203. adc r8, 0
  1204. xor r9, r9
  1205. mov rax, [reg_p2+16]
  1206. mul r15
  1207. add r10, rax
  1208. adc r8, rdx
  1209. adc r9, 0
  1210. mov rax, [reg_p2+40]
  1211. mul r12
  1212. add r10, rax
  1213. adc r8, rdx
  1214. adc r9, 0
  1215. mov rax, [reg_p2+32]
  1216. mul r14
  1217. add r10, rax
  1218. adc r8, rdx
  1219. adc r9, 0
  1220. mov rax, [reg_p2+8]
  1221. mul r11
  1222. add r10, rax
  1223. adc r8, rdx
  1224. adc r9, 0
  1225. mov rax, [reg_p2+24]
  1226. mul r13
  1227. add r10, rax
  1228. mov [rcx+48], r10 // c6
  1229. adc r8, rdx
  1230. adc r9, 0
  1231. xor r10, r10
  1232. mov rax, [reg_p2+40]
  1233. mul r14
  1234. add r8, rax
  1235. adc r9, rdx
  1236. adc r10, 0
  1237. mov rax, [reg_p2+16]
  1238. mul r11
  1239. add r8, rax
  1240. adc r9, rdx
  1241. adc r10, 0
  1242. mov rax, [reg_p2+32]
  1243. mul r13
  1244. add r8, rax
  1245. adc r9, rdx
  1246. adc r10, 0
  1247. mov rax, [reg_p2+24]
  1248. mul r15
  1249. add r8, rax
  1250. mov [rcx+56], r8 // c7
  1251. adc r9, rdx
  1252. adc r10, 0
  1253. xor r8, r8
  1254. mov rax, [reg_p2+24]
  1255. mul r11
  1256. add r9, rax
  1257. adc r10, rdx
  1258. adc r8, 0
  1259. mov rax, [reg_p2+32]
  1260. mul r15
  1261. add r9, rax
  1262. adc r10, rdx
  1263. adc r8, 0
  1264. mov rax, [reg_p2+40]
  1265. mul r13
  1266. add r9, rax
  1267. mov [rcx+64], r9 // c8
  1268. adc r10, rdx
  1269. adc r8, 0
  1270. xor r9, r9
  1271. mov rax, [reg_p2+40]
  1272. mul r15
  1273. add r10, rax
  1274. adc r8, rdx
  1275. adc r9, 0
  1276. mov rax, [reg_p2+32]
  1277. mul r11
  1278. add r10, rax
  1279. mov [rcx+72], r10 // c9
  1280. adc r8, rdx
  1281. adc r9, 0
  1282. mov rax, [reg_p2+40]
  1283. mul r11
  1284. add r8, rax
  1285. mov [rcx+80], r8 // c10
  1286. adc r9, rdx
  1287. mov [rcx+88], r9 // c11
  1288. // rcx[12-23] <- AH*BH
  1289. mov r11, [reg_p1+48]
  1290. mov rax, [reg_p2+48]
  1291. mul r11
  1292. xor r9, r9
  1293. mov [rcx+96], rax // c0
  1294. mov r8, rdx
  1295. mov rax, [reg_p2+56]
  1296. mul r11
  1297. xor r10, r10
  1298. add r8, rax
  1299. adc r9, rdx
  1300. mov r12, [reg_p1+56]
  1301. mov rax, [reg_p2+48]
  1302. mul r12
  1303. add r8, rax
  1304. mov [rcx+104], r8 // c1
  1305. adc r9, rdx
  1306. adc r10, 0
  1307. xor r8, r8
  1308. mov rax, [reg_p2+64]
  1309. mul r11
  1310. add r9, rax
  1311. adc r10, rdx
  1312. adc r8, 0
  1313. mov r13, [reg_p2+48]
  1314. mov rax, [reg_p1+64]
  1315. mul r13
  1316. add r9, rax
  1317. adc r10, rdx
  1318. adc r8, 0
  1319. mov rax, [reg_p2+56]
  1320. mul r12
  1321. add r9, rax
  1322. mov [rcx+112], r9 // c2
  1323. adc r10, rdx
  1324. adc r8, 0
  1325. xor r9, r9
  1326. mov rax, [reg_p2+72]
  1327. mul r11
  1328. add r10, rax
  1329. adc r8, rdx
  1330. adc r9, 0
  1331. mov rax, [reg_p1+72]
  1332. mul r13
  1333. add r10, rax
  1334. adc r8, rdx
  1335. adc r9, 0
  1336. mov rax, [reg_p2+64]
  1337. mul r12
  1338. add r10, rax
  1339. adc r8, rdx
  1340. adc r9, 0
  1341. mov r14, [reg_p1+64]
  1342. mov rax, [reg_p2+56]
  1343. mul r14
  1344. add r10, rax
  1345. mov [rcx+120], r10 // c3
  1346. adc r8, rdx
  1347. adc r9, 0
  1348. xor r10, r10
  1349. mov rax, [reg_p2+80]
  1350. mul r11
  1351. add r8, rax
  1352. adc r9, rdx
  1353. adc r10, 0
  1354. mov rax, [reg_p2+64]
  1355. mul r14
  1356. add r8, rax
  1357. adc r9, rdx
  1358. adc r10, 0
  1359. mov r15, [reg_p1+80]
  1360. mov rax, r13
  1361. mul r15
  1362. add r8, rax
  1363. adc r9, rdx
  1364. adc r10, 0
  1365. mov rax, [reg_p2+72]
  1366. mul r12
  1367. add r8, rax
  1368. adc r9, rdx
  1369. adc r10, 0
  1370. mov r13, [reg_p1+72]
  1371. mov rax, [reg_p2+56]
  1372. mul r13
  1373. add r8, rax
  1374. mov [rcx+128], r8 // c4
  1375. adc r9, rdx
  1376. adc r10, 0
  1377. xor r8, r8
  1378. mov rax, [reg_p2+88]
  1379. mul r11
  1380. add r9, rax
  1381. adc r10, rdx
  1382. adc r8, 0
  1383. mov rax, [reg_p2+64]
  1384. mul r13
  1385. add r9, rax
  1386. adc r10, rdx
  1387. adc r8, 0
  1388. mov rax, [reg_p2+72]
  1389. mul r14
  1390. add r9, rax
  1391. adc r10, rdx
  1392. adc r8, 0
  1393. mov r11, [reg_p1+88]
  1394. mov rax, [reg_p2+48]
  1395. mul r11
  1396. add r9, rax
  1397. adc r10, rdx
  1398. adc r8, 0
  1399. mov rax, [reg_p2+80]
  1400. mul r12
  1401. add r9, rax
  1402. adc r10, rdx
  1403. adc r8, 0
  1404. mov rax, [reg_p2+56]
  1405. mul r15
  1406. add r9, rax
  1407. mov [rcx+136], r9 // c5
  1408. adc r10, rdx
  1409. adc r8, 0
  1410. xor r9, r9
  1411. mov rax, [reg_p2+64]
  1412. mul r15
  1413. add r10, rax
  1414. adc r8, rdx
  1415. adc r9, 0
  1416. mov rax, [reg_p2+88]
  1417. mul r12
  1418. add r10, rax
  1419. adc r8, rdx
  1420. adc r9, 0
  1421. mov rax, [reg_p2+80]
  1422. mul r14
  1423. add r10, rax
  1424. adc r8, rdx
  1425. adc r9, 0
  1426. mov rax, [reg_p2+56]
  1427. mul r11
  1428. add r10, rax
  1429. adc r8, rdx
  1430. adc r9, 0
  1431. mov rax, [reg_p2+72]
  1432. mul r13
  1433. add r10, rax
  1434. mov [rcx+144], r10 // c6
  1435. adc r8, rdx
  1436. adc r9, 0
  1437. xor r10, r10
  1438. mov rax, [reg_p2+88]
  1439. mul r14
  1440. add r8, rax
  1441. adc r9, rdx
  1442. adc r10, 0
  1443. mov rax, [reg_p2+64]
  1444. mul r11
  1445. add r8, rax
  1446. adc r9, rdx
  1447. adc r10, 0
  1448. mov rax, [reg_p2+80]
  1449. mul r13
  1450. add r8, rax
  1451. adc r9, rdx
  1452. adc r10, 0
  1453. mov rax, [reg_p2+72]
  1454. mul r15
  1455. add r8, rax
  1456. mov [rcx+152], r8 // c7
  1457. adc r9, rdx
  1458. adc r10, 0
  1459. xor r8, r8
  1460. mov rax, [reg_p2+72]
  1461. mul r11
  1462. add r9, rax
  1463. adc r10, rdx
  1464. adc r8, 0
  1465. mov rax, [reg_p2+80]
  1466. mul r15
  1467. add r9, rax
  1468. adc r10, rdx
  1469. adc r8, 0
  1470. mov rax, [reg_p2+88]
  1471. mul r13
  1472. add r9, rax
  1473. mov [rcx+160], r9 // c8
  1474. adc r10, rdx
  1475. adc r8, 0
  1476. mov rax, [reg_p2+88]
  1477. mul r15
  1478. add r10, rax
  1479. adc r8, rdx
  1480. mov rax, [reg_p2+80]
  1481. mul r11
  1482. add r10, rax
  1483. mov [rcx+168], r10 // c9
  1484. adc r8, rdx
  1485. mov rax, [reg_p2+88]
  1486. mul r11
  1487. add r8, rax
  1488. mov [rcx+176], r8 // c10
  1489. adc rdx, 0
  1490. mov [rcx+184], rdx // c11
  1491. // [r8-r15,rax,rdx,rdi,[rsp]] <- (AH+AL)*(BH+BL) - AL*BL
  1492. mov r8, [rsp]
  1493. sub r8, [rcx]
  1494. mov r9, [rsp+8]
  1495. sbb r9, [rcx+8]
  1496. mov r10, [rsp+16]
  1497. sbb r10, [rcx+16]
  1498. mov r11, [rsp+24]
  1499. sbb r11, [rcx+24]
  1500. mov r12, [rsp+32]
  1501. sbb r12, [rcx+32]
  1502. mov r13, [rsp+40]
  1503. sbb r13, [rcx+40]
  1504. mov r14, [rsp+48]
  1505. sbb r14, [rcx+48]
  1506. mov r15, [rsp+56]
  1507. sbb r15, [rcx+56]
  1508. mov rax, [rsp+64]
  1509. sbb rax, [rcx+64]
  1510. mov rdx, [rsp+72]
  1511. sbb rdx, [rcx+72]
  1512. mov rdi, [rsp+80]
  1513. sbb rdi, [rcx+80]
  1514. mov rsi, [rsp+88]
  1515. sbb rsi, [rcx+88]
  1516. mov [rsp], rsi
  1517. // [r8-r15,rax,rdx,rdi,[rsp]] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH
  1518. mov rsi, [rcx+96]
  1519. sub r8, rsi
  1520. mov rsi, [rcx+104]
  1521. sbb r9, rsi
  1522. mov rsi, [rcx+112]
  1523. sbb r10, rsi
  1524. mov rsi, [rcx+120]
  1525. sbb r11, rsi
  1526. mov rsi, [rcx+128]
  1527. sbb r12, rsi
  1528. mov rsi, [rcx+136]
  1529. sbb r13, rsi
  1530. mov rsi, [rcx+144]
  1531. sbb r14, rsi
  1532. mov rsi, [rcx+152]
  1533. sbb r15, rsi
  1534. mov rsi, [rcx+160]
  1535. sbb rax, rsi
  1536. mov rsi, [rcx+168]
  1537. sbb rdx, rsi
  1538. mov rsi, [rcx+176]
  1539. sbb rdi, rsi
  1540. mov rsi, [rsp]
  1541. sbb rsi, [rcx+184]
  1542. // Final result
  1543. add r8, [rcx+48]
  1544. mov [rcx+48], r8
  1545. adc r9, [rcx+56]
  1546. mov [rcx+56], r9
  1547. adc r10, [rcx+64]
  1548. mov [rcx+64], r10
  1549. adc r11, [rcx+72]
  1550. mov [rcx+72], r11
  1551. adc r12, [rcx+80]
  1552. mov [rcx+80], r12
  1553. adc r13, [rcx+88]
  1554. mov [rcx+88], r13
  1555. adc r14, [rcx+96]
  1556. mov [rcx+96], r14
  1557. adc r15, [rcx+104]
  1558. mov [rcx+104], r15
  1559. adc rax, [rcx+112]
  1560. mov [rcx+112], rax
  1561. adc rdx, [rcx+120]
  1562. mov [rcx+120], rdx
  1563. adc rdi, [rcx+128]
  1564. mov [rcx+128], rdi
  1565. adc rsi, [rcx+136]
  1566. mov [rcx+136], rsi
  1567. mov rax, [rcx+144]
  1568. adc rax, 0
  1569. mov [rcx+144], rax
  1570. mov rax, [rcx+152]
  1571. adc rax, 0
  1572. mov [rcx+152], rax
  1573. mov rax, [rcx+160]
  1574. adc rax, 0
  1575. mov [rcx+160], rax
  1576. mov rax, [rcx+168]
  1577. adc rax, 0
  1578. mov [rcx+168], rax
  1579. mov rax, [rcx+176]
  1580. adc rax, 0
  1581. mov [rcx+176], rax
  1582. mov rax, [rcx+184]
  1583. adc rax, 0
  1584. mov [rcx+184], rax
  1585. add rsp, 96 // Restoring space in stack
  1586. pop r15
  1587. pop r14
  1588. pop r13
  1589. pop r12
  1590. ret
  1591. #endif
  //***********************************************************************
  //  Montgomery reduction
  //  Based on comba method
  //  Operation: c [reg_p2] = (a [reg_p1] + q*p751) / 2^768, which is
  //             congruent to a * 2^(-768) modulo p751
  //  NOTE: a=c is not allowed
  //
  //  Inputs:   reg_p1 -> a, 24 64-bit limbs (offsets 0..184 are read)
  //            reg_p2 -> c, 12 64-bit limbs (offsets 0..88 are written)
  //  Clobbers: rax, rcx, rdx, r8-r11, flags (r12-r15 are saved and restored)
  //
  //  How it works, as far as this code shows:
  //   * Only limbs 5..11 of p751+1 are used (p751p1_5..p751p1_11); the lower
  //     five limbs are treated as zero, so column k of a + q*(p751+1) only
  //     receives the products q_i * p_(k-i) with 5 <= k-i <= 11.
  //   * The multipliers are q0..q4 = a[0..4] and q5..q11 = the results of
  //     columns 5..11. Those seven column results are parked in c[5..11] and
  //     reloaded into registers later, which is why c must not alias a:
  //     c[5..11] is written while the upper limbs of a are still unread.
  //   * Columns 12..23 are the result limbs z0..z11 and overwrite c[0..11].
  //   * Multiplier registers rotate: r11 = q0 then q6, r12 = q1 then q7,
  //     r13 = q2 then q8, r14 = q3 then q9, r15 = q4 then q10,
  //     rcx = q5 then q11.
  //   * r8, r9, r10 form a rotating three-word column accumulator.
  //   * No carry out of the top column is kept and no conditional subtraction
  //     is performed here. NOTE(review): the permitted input range and the
  //     resulting output range should be confirmed against the C caller.
  //***********************************************************************
  .global rdc751_asm
  rdc751_asm:
      push    r12
      push    r13
      push    r14
      push    r15

      // ---- column 5: a[5] + q0*p5 -> q5 (parked in c[5]) ----
      mov     r11, [reg_p1]               // q0 = a[0]
      movq    rax, p751p1_5
      mul     r11                         // q0 * p5
      xor     r8, r8
      add     rax, [reg_p1+40]
      mov     [reg_p2+40], rax            // z5 (temporary, reloaded as q5)
      adc     r8, rdx

      // ---- column 6 -> q6 (parked in c[6]) ----
      xor     r9, r9
      movq    rax, p751p1_6
      mul     r11                         // q0 * p6
      xor     r10, r10
      add     r8, rax
      adc     r9, rdx

      mov     r12, [reg_p1+8]             // q1 = a[1]
      movq    rax, p751p1_5
      mul     r12                         // q1 * p5
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0
      add     r8, [reg_p1+48]
      mov     [reg_p2+48], r8             // z6 (temporary, reloaded as q6)
      adc     r9, 0
      adc     r10, 0

      // ---- column 7 -> q7 (parked in c[7]) ----
      xor     r8, r8
      movq    rax, p751p1_7
      mul     r11                         // q0 * p7
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_6
      mul     r12                         // q1 * p6
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      mov     r13, [reg_p1+16]            // q2 = a[2]
      movq    rax, p751p1_5
      mul     r13                         // q2 * p5
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0
      add     r9, [reg_p1+56]
      mov     [reg_p2+56], r9             // z7 (temporary, reloaded as q7)
      adc     r10, 0
      adc     r8, 0

      // ---- column 8 -> q8 (parked in c[8]) ----
      xor     r9, r9
      movq    rax, p751p1_8
      mul     r11                         // q0 * p8
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_7
      mul     r12                         // q1 * p7
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_6
      mul     r13                         // q2 * p6
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      mov     r14, [reg_p1+24]            // q3 = a[3]
      movq    rax, p751p1_5
      mul     r14                         // q3 * p5
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0
      add     r10, [reg_p1+64]
      mov     [reg_p2+64], r10            // z8 (temporary, reloaded as q8)
      adc     r8, 0
      adc     r9, 0

      // ---- column 9 -> q9 (parked in c[9]) ----
      xor     r10, r10
      movq    rax, p751p1_9
      mul     r11                         // q0 * p9
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_8
      mul     r12                         // q1 * p8
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_7
      mul     r13                         // q2 * p7
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_6
      mul     r14                         // q3 * p6
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      mov     r15, [reg_p1+32]            // q4 = a[4]
      movq    rax, p751p1_5
      mul     r15                         // q4 * p5
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0
      add     r8, [reg_p1+72]
      mov     [reg_p2+72], r8             // z9 (temporary, reloaded as q9)
      adc     r9, 0
      adc     r10, 0

      // ---- column 10 -> q10 (parked in c[10]) ----
      xor     r8, r8
      movq    rax, p751p1_10
      mul     r11                         // q0 * p10
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_9
      mul     r12                         // q1 * p9
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_8
      mul     r13                         // q2 * p8
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_7
      mul     r14                         // q3 * p7
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_6
      mul     r15                         // q4 * p6
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      mov     rcx, [reg_p2+40]            // q5 = column-5 result parked in c[5]
      movq    rax, p751p1_5
      mul     rcx                         // q5 * p5
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0
      add     r9, [reg_p1+80]
      mov     [reg_p2+80], r9             // z10 (temporary, reloaded as q10)
      adc     r10, 0
      adc     r8, 0

      // ---- column 11 -> q11 (parked in c[11]) ----
      xor     r9, r9
      movq    rax, p751p1_11
      mul     r11                         // q0 * p11 (last use of q0)
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_10
      mul     r12                         // q1 * p10
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_9
      mul     r13                         // q2 * p9
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_8
      mul     r14                         // q3 * p8
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_7
      mul     r15                         // q4 * p7
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_6
      mul     rcx                         // q5 * p6
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      mov     r11, [reg_p2+48]            // r11 now holds q6 (from c[6])
      movq    rax, p751p1_5
      mul     r11                         // q6 * p5
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0
      add     r10, [reg_p1+88]
      mov     [reg_p2+88], r10            // z11 (temporary, reloaded as q11)
      adc     r8, 0
      adc     r9, 0

      // ---- column 12 -> result limb z0 ----
      xor     r10, r10
      movq    rax, p751p1_11
      mul     r12                         // q1 * p11 (last use of q1)
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_10
      mul     r13                         // q2 * p10
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_9
      mul     r14                         // q3 * p9
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_8
      mul     r15                         // q4 * p8
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_7
      mul     rcx                         // q5 * p7
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_6
      mul     r11                         // q6 * p6
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      mov     r12, [reg_p2+56]            // r12 now holds q7 (from c[7])
      movq    rax, p751p1_5
      mul     r12                         // q7 * p5
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0
      add     r8, [reg_p1+96]
      mov     [reg_p2], r8                // z0
      adc     r9, 0
      adc     r10, 0

      // ---- column 13 -> result limb z1 ----
      xor     r8, r8
      movq    rax, p751p1_11
      mul     r13                         // q2 * p11 (last use of q2)
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_10
      mul     r14                         // q3 * p10
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_9
      mul     r15                         // q4 * p9
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_8
      mul     rcx                         // q5 * p8
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_7
      mul     r11                         // q6 * p7
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_6
      mul     r12                         // q7 * p6
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      mov     r13, [reg_p2+64]            // r13 now holds q8 (from c[8])
      movq    rax, p751p1_5
      mul     r13                         // q8 * p5
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0
      add     r9, [reg_p1+104]
      mov     [reg_p2+8], r9              // z1
      adc     r10, 0
      adc     r8, 0

      // ---- column 14 -> result limb z2 ----
      xor     r9, r9
      movq    rax, p751p1_11
      mul     r14                         // q3 * p11 (last use of q3)
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_10
      mul     r15                         // q4 * p10
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_9
      mul     rcx                         // q5 * p9
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_8
      mul     r11                         // q6 * p8
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_7
      mul     r12                         // q7 * p7
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_6
      mul     r13                         // q8 * p6
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      mov     r14, [reg_p2+72]            // r14 now holds q9 (from c[9])
      movq    rax, p751p1_5
      mul     r14                         // q9 * p5
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0
      add     r10, [reg_p1+112]
      mov     [reg_p2+16], r10            // z2
      adc     r8, 0
      adc     r9, 0

      // ---- column 15 -> result limb z3 ----
      xor     r10, r10
      movq    rax, p751p1_11
      mul     r15                         // q4 * p11 (last use of q4)
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_10
      mul     rcx                         // q5 * p10
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_9
      mul     r11                         // q6 * p9
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_8
      mul     r12                         // q7 * p8
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_7
      mul     r13                         // q8 * p7
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_6
      mul     r14                         // q9 * p6
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      mov     r15, [reg_p2+80]            // r15 now holds q10 (from c[10])
      movq    rax, p751p1_5
      mul     r15                         // q10 * p5
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0
      add     r8, [reg_p1+120]
      mov     [reg_p2+24], r8             // z3
      adc     r9, 0
      adc     r10, 0

      // ---- column 16 -> result limb z4 ----
      xor     r8, r8
      movq    rax, p751p1_11
      mul     rcx                         // q5 * p11 (last use of q5)
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_10
      mul     r11                         // q6 * p10
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_9
      mul     r12                         // q7 * p9
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_8
      mul     r13                         // q8 * p8
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_7
      mul     r14                         // q9 * p7
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_6
      mul     r15                         // q10 * p6
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      mov     rcx, [reg_p2+88]            // rcx now holds q11 (from c[11])
      movq    rax, p751p1_5
      mul     rcx                         // q11 * p5
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0
      add     r9, [reg_p1+128]
      mov     [reg_p2+32], r9             // z4
      adc     r10, 0
      adc     r8, 0

      // ---- column 17 -> result limb z5 (overwrites the parked q5) ----
      xor     r9, r9
      movq    rax, p751p1_11
      mul     r11                         // q6 * p11
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_10
      mul     r12                         // q7 * p10
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_9
      mul     r13                         // q8 * p9
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_8
      mul     r14                         // q9 * p8
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_7
      mul     r15                         // q10 * p7
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_6
      mul     rcx                         // q11 * p6
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0
      add     r10, [reg_p1+136]
      mov     [reg_p2+40], r10            // z5
      adc     r8, 0
      adc     r9, 0

      // ---- column 18 -> result limb z6 ----
      xor     r10, r10
      movq    rax, p751p1_11
      mul     r12                         // q7 * p11
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_10
      mul     r13                         // q8 * p10
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_9
      mul     r14                         // q9 * p9
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_8
      mul     r15                         // q10 * p8
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_7
      mul     rcx                         // q11 * p7
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0
      add     r8, [reg_p1+144]
      mov     [reg_p2+48], r8             // z6
      adc     r9, 0
      adc     r10, 0

      // ---- column 19 -> result limb z7 ----
      xor     r8, r8
      movq    rax, p751p1_11
      mul     r13                         // q8 * p11
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_10
      mul     r14                         // q9 * p10
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_9
      mul     r15                         // q10 * p9
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0

      movq    rax, p751p1_8
      mul     rcx                         // q11 * p8
      add     r9, rax
      adc     r10, rdx
      adc     r8, 0
      add     r9, [reg_p1+152]
      mov     [reg_p2+56], r9             // z7
      adc     r10, 0
      adc     r8, 0

      // ---- column 20 -> result limb z8 ----
      xor     r9, r9
      movq    rax, p751p1_11
      mul     r14                         // q9 * p11
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_10
      mul     r15                         // q10 * p10
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0

      movq    rax, p751p1_9
      mul     rcx                         // q11 * p9
      add     r10, rax
      adc     r8, rdx
      adc     r9, 0
      add     r10, [reg_p1+160]
      mov     [reg_p2+64], r10            // z8
      adc     r8, 0
      adc     r9, 0

      // ---- column 21 -> result limb z9 ----
      xor     r10, r10
      movq    rax, p751p1_11
      mul     r15                         // q10 * p11
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0

      movq    rax, p751p1_10
      mul     rcx                         // q11 * p10
      add     r8, rax
      adc     r9, rdx
      adc     r10, 0
      add     r8, [reg_p1+168]
      mov     [reg_p2+72], r8             // z9
      adc     r9, 0
      adc     r10, 0

      // ---- column 22 -> result limb z10 ----
      movq    rax, p751p1_11
      mul     rcx                         // q11 * p11
      add     r9, rax
      adc     r10, rdx
      add     r9, [reg_p1+176]
      mov     [reg_p2+80], r9             // z10
      adc     r10, 0

      // ---- column 23 -> result limb z11 (carry out of this add is dropped) ----
      add     r10, [reg_p1+184]
      mov     [reg_p2+88], r10            // z11

      pop     r15
      pop     r14
      pop     r13
      pop     r12
      ret
  //***********************************************************************
  //  751-bit multiprecision addition
  //  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
  //
  //  Inputs:   reg_p1 -> a, reg_p2 -> b, reg_p3 -> c; 12 64-bit limbs each
  //            (offsets 0..88).
  //  Output:   c = a + b truncated to 12 limbs. The carry out of the top
  //            limb is not stored or returned.
  //  Clobbers: rax, rcx, rdi, r8-r11, flags
  //            (rbx and r12-r15 are saved and restored).
  //  All twelve limbs of a are loaded and all of b is added before the first
  //  store to c, so c may overlap a or b.
  //***********************************************************************
  .global mp_add751_asm
  mp_add751_asm:
      push    r12
      push    r13
      push    r14
      push    r15
      push    rbx

      // Load a[0..11] into registers.
      mov     r8, [reg_p1]
      mov     r9, [reg_p1+8]
      mov     r10, [reg_p1+16]
      mov     r11, [reg_p1+24]
      mov     r12, [reg_p1+32]
      mov     r13, [reg_p1+40]
      mov     r14, [reg_p1+48]
      mov     r15, [reg_p1+56]
      mov     rax, [reg_p1+64]
      mov     rbx, [reg_p1+72]
      mov     rcx, [reg_p1+80]
      mov     rdi, [reg_p1+88]            // last read through reg_p1; rdi becomes a[11]

      // Add b limb by limb, rippling the carry upwards.
      add     r8, [reg_p2]
      adc     r9, [reg_p2+8]
      adc     r10, [reg_p2+16]
      adc     r11, [reg_p2+24]
      adc     r12, [reg_p2+32]
      adc     r13, [reg_p2+40]
      adc     r14, [reg_p2+48]
      adc     r15, [reg_p2+56]
      adc     rax, [reg_p2+64]
      adc     rbx, [reg_p2+72]
      adc     rcx, [reg_p2+80]
      adc     rdi, [reg_p2+88]

      // Store the 12-limb sum.
      mov     [reg_p3], r8
      mov     [reg_p3+8], r9
      mov     [reg_p3+16], r10
      mov     [reg_p3+24], r11
      mov     [reg_p3+32], r12
      mov     [reg_p3+40], r13
      mov     [reg_p3+48], r14
      mov     [reg_p3+56], r15
      mov     [reg_p3+64], rax
      mov     [reg_p3+72], rbx
      mov     [reg_p3+80], rcx
      mov     [reg_p3+88], rdi

      pop     rbx
      pop     r15
      pop     r14
      pop     r13
      pop     r12
      ret
  //***********************************************************************
  //  2x751-bit multiprecision addition
  //  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
  //
  //  Inputs:   reg_p1 -> a, reg_p2 -> b, reg_p3 -> c; 24 64-bit limbs each
  //            (offsets 0..184).
  //  Output:   c = a + b truncated to 24 limbs. The carry out of the top
  //            limb is not stored or returned.
  //  Clobbers: rax, rcx, rdi, r8-r11, flags
  //            (rbx and r12-r15 are saved and restored).
  //  The sum is produced in three pieces (limbs 0..10, limb 11, limbs 12..23).
  //  Only mov instructions sit between the pieces; mov leaves the carry flag
  //  untouched, so one carry chain runs through all 24 limbs.
  //  Limbs 0..10 of c are stored before limbs 11..23 of a and b are read.
  //***********************************************************************
  .global mp_add751x2_asm
  mp_add751x2_asm:
      push    r12
      push    r13
      push    r14
      push    r15
      push    rbx

      // ---- limbs 0..10 ----
      mov     r8, [reg_p1]
      mov     r9, [reg_p1+8]
      mov     r10, [reg_p1+16]
      mov     r11, [reg_p1+24]
      mov     r12, [reg_p1+32]
      mov     r13, [reg_p1+40]
      mov     r14, [reg_p1+48]
      mov     r15, [reg_p1+56]
      mov     rax, [reg_p1+64]
      mov     rbx, [reg_p1+72]
      mov     rcx, [reg_p1+80]

      add     r8, [reg_p2]
      adc     r9, [reg_p2+8]
      adc     r10, [reg_p2+16]
      adc     r11, [reg_p2+24]
      adc     r12, [reg_p2+32]
      adc     r13, [reg_p2+40]
      adc     r14, [reg_p2+48]
      adc     r15, [reg_p2+56]
      adc     rax, [reg_p2+64]
      adc     rbx, [reg_p2+72]
      adc     rcx, [reg_p2+80]

      mov     [reg_p3], r8
      mov     [reg_p3+8], r9
      mov     [reg_p3+16], r10
      mov     [reg_p3+24], r11
      mov     [reg_p3+32], r12
      mov     [reg_p3+40], r13
      mov     [reg_p3+48], r14
      mov     [reg_p3+56], r15
      mov     [reg_p3+64], rax
      mov     [reg_p3+72], rbx
      mov     [reg_p3+80], rcx

      // ---- limb 11 (carry from limb 10 is still live) ----
      mov     rax, [reg_p1+88]
      adc     rax, [reg_p2+88]
      mov     [reg_p3+88], rax

      // ---- limbs 12..23 ----
      mov     r8, [reg_p1+96]
      mov     r9, [reg_p1+104]
      mov     r10, [reg_p1+112]
      mov     r11, [reg_p1+120]
      mov     r12, [reg_p1+128]
      mov     r13, [reg_p1+136]
      mov     r14, [reg_p1+144]
      mov     r15, [reg_p1+152]
      mov     rax, [reg_p1+160]
      mov     rbx, [reg_p1+168]
      mov     rcx, [reg_p1+176]
      mov     rdi, [reg_p1+184]           // last read through reg_p1; rdi becomes a[23]

      adc     r8, [reg_p2+96]
      adc     r9, [reg_p2+104]
      adc     r10, [reg_p2+112]
      adc     r11, [reg_p2+120]
      adc     r12, [reg_p2+128]
      adc     r13, [reg_p2+136]
      adc     r14, [reg_p2+144]
      adc     r15, [reg_p2+152]
      adc     rax, [reg_p2+160]
      adc     rbx, [reg_p2+168]
      adc     rcx, [reg_p2+176]
      adc     rdi, [reg_p2+184]

      mov     [reg_p3+96], r8
      mov     [reg_p3+104], r9
      mov     [reg_p3+112], r10
      mov     [reg_p3+120], r11
      mov     [reg_p3+128], r12
      mov     [reg_p3+136], r13
      mov     [reg_p3+144], r14
      mov     [reg_p3+152], r15
      mov     [reg_p3+160], rax
      mov     [reg_p3+168], rbx
      mov     [reg_p3+176], rcx
      mov     [reg_p3+184], rdi

      pop     rbx
      pop     r15
      pop     r14
      pop     r13
      pop     r12
      ret
  //***********************************************************************
  //  2x751-bit multiprecision subtraction
  //  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. Returns borrow mask
  //
  //  Inputs:   reg_p1 -> a, reg_p2 -> b, reg_p3 -> c; 24 64-bit limbs each
  //            (offsets 0..184).
  //  Output:   c = a - b truncated to 24 limbs.
  //            rax = 0 when the top limb produced no borrow,
  //            rax = 0xFFFFFFFFFFFFFFFF when it did.
  //  Clobbers: rcx, rdi, r8-r11, flags
  //            (rbx and r12-r15 are saved and restored).
  //  The difference is produced in three pieces (limbs 0..10, limb 11,
  //  limbs 12..23). Only mov instructions sit between the pieces; mov leaves
  //  the carry flag untouched, so one borrow chain runs through all 24 limbs.
  //  Limbs 0..10 of c are stored before limbs 11..23 of a and b are read.
  //***********************************************************************
  .global mp_sub751x2_asm
  mp_sub751x2_asm:
      push    r12
      push    r13
      push    r14
      push    r15
      push    rbx

      // ---- limbs 0..10 ----
      mov     r8, [reg_p1]
      mov     r9, [reg_p1+8]
      mov     r10, [reg_p1+16]
      mov     r11, [reg_p1+24]
      mov     r12, [reg_p1+32]
      mov     r13, [reg_p1+40]
      mov     r14, [reg_p1+48]
      mov     r15, [reg_p1+56]
      mov     rax, [reg_p1+64]
      mov     rbx, [reg_p1+72]
      mov     rcx, [reg_p1+80]

      sub     r8, [reg_p2]
      sbb     r9, [reg_p2+8]
      sbb     r10, [reg_p2+16]
      sbb     r11, [reg_p2+24]
      sbb     r12, [reg_p2+32]
      sbb     r13, [reg_p2+40]
      sbb     r14, [reg_p2+48]
      sbb     r15, [reg_p2+56]
      sbb     rax, [reg_p2+64]
      sbb     rbx, [reg_p2+72]
      sbb     rcx, [reg_p2+80]

      mov     [reg_p3], r8
      mov     [reg_p3+8], r9
      mov     [reg_p3+16], r10
      mov     [reg_p3+24], r11
      mov     [reg_p3+32], r12
      mov     [reg_p3+40], r13
      mov     [reg_p3+48], r14
      mov     [reg_p3+56], r15
      mov     [reg_p3+64], rax
      mov     [reg_p3+72], rbx
      mov     [reg_p3+80], rcx

      // ---- limb 11 (borrow from limb 10 is still live) ----
      mov     rax, [reg_p1+88]
      sbb     rax, [reg_p2+88]
      mov     [reg_p3+88], rax

      // ---- limbs 12..23 ----
      mov     r8, [reg_p1+96]
      mov     r9, [reg_p1+104]
      mov     r10, [reg_p1+112]
      mov     r11, [reg_p1+120]
      mov     r12, [reg_p1+128]
      mov     r13, [reg_p1+136]
      mov     r14, [reg_p1+144]
      mov     r15, [reg_p1+152]
      mov     rax, [reg_p1+160]
      mov     rbx, [reg_p1+168]
      mov     rcx, [reg_p1+176]
      mov     rdi, [reg_p1+184]           // last read through reg_p1; rdi becomes a[23]

      sbb     r8, [reg_p2+96]
      sbb     r9, [reg_p2+104]
      sbb     r10, [reg_p2+112]
      sbb     r11, [reg_p2+120]
      sbb     r12, [reg_p2+128]
      sbb     r13, [reg_p2+136]
      sbb     r14, [reg_p2+144]
      sbb     r15, [reg_p2+152]
      sbb     rax, [reg_p2+160]
      sbb     rbx, [reg_p2+168]
      sbb     rcx, [reg_p2+176]
      sbb     rdi, [reg_p2+184]

      mov     [reg_p3+96], r8
      mov     [reg_p3+104], r9
      mov     [reg_p3+112], r10
      mov     [reg_p3+120], r11
      mov     [reg_p3+128], r12
      mov     [reg_p3+136], r13
      mov     [reg_p3+144], r14
      mov     [reg_p3+152], r15
      mov     [reg_p3+160], rax

      // Turn the final borrow into an all-zeros / all-ones mask in rax.
      // This has to be "mov rax, 0": the usual xor zeroing idiom would clear
      // the carry flag that the following sbb consumes.
      mov     rax, 0
      sbb     rax, 0

      mov     [reg_p3+168], rbx
      mov     [reg_p3+176], rcx
      mov     [reg_p3+184], rdi

      pop     rbx
      pop     r15
      pop     r14
      pop     r13
      pop     r12
      ret