Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.
 
 
 
 

453 wiersze
7.9 KiB

  /*
   * Constants for arithmetic modulo p (GNU as, Intel syntax).
   *
   * Every multi-limb value below is eight 64-bit limbs, least significant
   * limb first.  Values handled by the fp_* routines are kept in Montgomery
   * form with R = 2^512.
   */
  .intel_syntax noprefix

  .section .rodata

  .set pbits, 511                         /* bit length of p */

  /* the modulus p */
  p:
          .quad 0x1b81b90533c6c87b, 0xc2721bf457aca835
          .quad 0x516730cc1f0b4f25, 0xa7aac6c567f35507
          .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42
          .quad 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf

  /* the field element 0 (eight zero limbs) */
  .global fp_0
  fp_0:
          .fill 8, 8, 0

  /* the field element 1 in Montgomery form: 2^512 mod p */
  .global fp_1
  fp_1:
          .quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95
          .quad 0x5d319e67c1e961b4, 0xb0aa7275301955f1
          .quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b
          .quad 0x06ea9e5d4383676a, 0x3496e2e117e0ec80

  /* (2^512)^2 mod p; multiplying by it converts into Montgomery form */
  .r_squared_mod_p:
          .quad 0x36905b572ffc1724, 0x67086f4525f1f27d
          .quad 0x4faf3fbfd22370ca, 0x192ea214bcc584b1
          .quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371
          .quad 0xad5f166e20e4f52d, 0x4ed759aea6f3917e

  /* -p^-1 mod 2^64, the per-limb Montgomery reduction factor */
  .inv_min_p_mod_r:
          .quad 0x66c1301f632e294d

  .section .text
  /*
   * fp_copy: copy one field element (eight 64-bit limbs = 64 bytes).
   *   In:  rdi = destination, rsi = source
   *   Out: [rdi .. rdi+63] = [rsi .. rsi+63]
   * On return rdi and rsi have advanced by 64 and rcx is 0.
   */
  .global fp_copy
  fp_copy:
          cld                             /* string copy must run forwards */
          mov     ecx, 8                  /* 8 limbs; 32-bit write zero-extends into rcx */
          rep movsq
          ret
  /*
   * fp_set: initialise the field element at rdi via u512_set, then convert
   * it in place to Montgomery form by tail-calling fp_enc(x, x).
   *   In:  rdi = destination; remaining arguments are passed through
   *        unchanged to u512_set (defined elsewhere; presumably rsi holds
   *        the integer value to store).
   */
  .global fp_set
  fp_set:
          push    rdi                     /* keep the destination across the call */
          call    u512_set
          pop     rsi                     /* fp_enc source      = destination */
          mov     rdi, rsi                /* fp_enc destination = destination */
          jmp     fp_enc
  /*
   * fp_cswap: conditionally swap two field elements without branching.
   *   In:  rdi = first element, rsi = second element,
   *        dl  = condition (expected to be 0 or 1)
   *   Out: the two 8-limb values are exchanged when dl == 1 and left
   *        untouched when dl == 0.
   * The same instruction sequence runs for either condition.
   */
  .global fp_cswap
  fp_cswap:
          movzx   rax, dl
          neg     rax                     /* rax = 0 (keep) or all ones (swap) */
  .irp off, 0, 8, 16, 24, 32, 40, 48, 56
          mov     r9,  [rdi + \off]
          mov     r10, [rsi + \off]
          mov     r11, r9
          xor     r11, r10                /* difference of the two limbs ...  */
          and     r11, rax                /* ... kept only when swapping      */
          xor     r9,  r11
          xor     r10, r11
          mov     [rdi + \off], r9
          mov     [rsi + \off], r10
  .endr
          ret
  /*
   * .reduce_once: subtract p once if the value is at least p.
   *   In:  rdi = pointer to an 8-limb value x
   *   Out: [rdi] = x - p when x >= p, otherwise x unchanged
   *   Clobbers: rax, rcx, rdx, rsi, rdi, r8-r11, flags (rbp is preserved)
   * Branch-free: x - p is always computed and then selected with a mask.
   */
  .reduce_once:
          push    rbp
          mov     rbp, rdi                /* rdi is reused as a data register below */

          /* rdi, rsi, rdx, rcx, r8..r11 = x - p, limb by limb */
          mov     rdi, [rbp + 0]
          sub     rdi, [rip + p + 0]
          mov     rsi, [rbp + 8]
          sbb     rsi, [rip + p + 8]
          mov     rdx, [rbp + 16]
          sbb     rdx, [rip + p + 16]
          mov     rcx, [rbp + 24]
          sbb     rcx, [rip + p + 24]
          mov     r8,  [rbp + 32]
          sbb     r8,  [rip + p + 32]
          mov     r9,  [rbp + 40]
          sbb     r9,  [rip + p + 40]
          mov     r10, [rbp + 48]
          sbb     r10, [rip + p + 48]
          mov     r11, [rbp + 56]
          sbb     r11, [rip + p + 56]

          /* CF = borrow, i.e. x < p.  Build rax = all ones iff there was no borrow. */
          sbb     rax, rax                /* rax = -CF */
          not     rax                     /* rax = all ones if x >= p, else 0 */

  /* \mem = rax ? \reg : \mem   (clobbers \reg) */
  .macro cswap2, reg, mem
          xor     \reg, \mem
          and     \reg, rax
          xor     \mem, \reg
  .endm

          cswap2 rdi, [rbp + 0]
          cswap2 rsi, [rbp + 8]
          cswap2 rdx, [rbp + 16]
          cswap2 rcx, [rbp + 24]
          cswap2 r8, [rbp + 32]
          cswap2 r9, [rbp + 40]
          cswap2 r10, [rbp + 48]
          cswap2 r11, [rbp + 56]

          pop     rbp
          ret
  /*
   * fp_add3: modular addition.
   *   In:  rdi = destination, rsi and rdx = operands (passed straight
   *        through to u512_add3, which is defined elsewhere and presumably
   *        stores the 512-bit sum of [rsi] and [rdx] at [rdi]).
   *   Out: [rdi] = the sum after one conditional subtraction of p.
   */
  .global fp_add3
  fp_add3:
          push    rdi                     /* destination must survive the call */
          call    u512_add3
          pop     rdi
          jmp     .reduce_once            /* tail call: bring the sum back below p */
  /*
   * fp_add2: two-operand modular addition, accumulating into the first.
   *   In:  rdi = destination and second operand, rsi = first operand
   * Tail-calls fp_add3(rdi, rsi, rdi).
   */
  .global fp_add2
  fp_add2:
          mov     rdx, rdi                /* third argument = destination */
          jmp     fp_add3
  /*
   * fp_sub3: modular subtraction.
   *   In:  rdi = destination, rsi and rdx = operands (passed straight
   *        through to u512_sub3, which is defined elsewhere and presumably
   *        stores [rsi] - [rdx] at [rdi] and returns the borrow in rax).
   *   Out: if u512_sub3 returned a nonzero rax, p is added back to [rdi];
   *        otherwise [rdi] is left as u512_sub3 wrote it.
   * Branch-free: p is masked to either p or 0 and always added.
   */
  .global fp_sub3
  fp_sub3:
          push    rdi                     /* destination must survive the call */
          call    u512_sub3
          pop     rdi

          /* turn "rax != 0" into a full-width mask */
          neg     rax                     /* CF = (rax != 0) */
          sbb     rax, rax                /* rax = all ones if it was nonzero, else 0 */

          /* addend = p & mask, limbs 0..6 in rsi, rdx, rcx, r8..r11 */
          mov     rsi, [rip + p + 0]
          mov     rdx, [rip + p + 8]
          mov     rcx, [rip + p + 16]
          mov     r8,  [rip + p + 24]
          mov     r9,  [rip + p + 32]
          mov     r10, [rip + p + 40]
          mov     r11, [rip + p + 48]
          and     rsi, rax
          and     rdx, rax
          and     rcx, rax
          and     r8,  rax
          and     r9,  rax
          and     r10, rax
          and     r11, rax
          and     rax, [rip + p + 56]     /* mask is consumed last: rax = top limb of p, or 0 */

          add     [rdi + 0],  rsi
          adc     [rdi + 8],  rdx
          adc     [rdi + 16], rcx
          adc     [rdi + 24], r8
          adc     [rdi + 32], r9
          adc     [rdi + 40], r10
          adc     [rdi + 48], r11
          adc     [rdi + 56], rax
          ret
  /*
   * fp_sub2: two-operand modular subtraction, accumulating into the first.
   *   In:  rdi = destination and first operand, rsi = second operand
   * Tail-calls fp_sub3(rdi, rdi, rsi).
   */
  .global fp_sub2
  fp_sub2:
          mov     rdx, rsi                /* third argument  = caller's second operand */
          mov     rsi, rdi                /* second argument = destination */
          jmp     fp_sub3
  /* ---- Montgomery arithmetic ---- */

  /*
   * fp_enc: convert into Montgomery form.
   *   In:  rdi = destination, rsi = source
   * Tail-calls fp_mul3 with (2^512)^2 mod p as the second factor.
   */
  .global fp_enc
  fp_enc:
          lea     rdx, [rip + .r_squared_mod_p]
          jmp     fp_mul3
  /*
   * fp_dec: convert out of Montgomery form.
   *   In:  rdi = destination, rsi = source
   * Tail-calls fp_mul3 with u512_1 as the second factor (u512_1 is defined
   * elsewhere; presumably the plain integer 1).
   */
  .global fp_dec
  fp_dec:
          lea     rdx, [rip + u512_1]
          jmp     fp_mul3
  /*
   * fp_mul3: Montgomery multiplication, interleaved limb by limb.
   *   In:  rdi = destination, rsi = first factor a, rdx = second factor b
   *   Out: [rdi] = Montgomery product of a and b, after one conditional
   *        subtraction of p (via the tail call to .reduce_once)
   *   Requires the BMI2 (mulx) and ADX (adcx/adox) instruction sets.
   *
   * Register roles inside the body:
   *   rdi = a, rsi = b, destination pointer parked on the stack
   *   r8..r15, rbp = nine-limb accumulator whose roles rotate by one
   *                  register per step
   *   rax, rbx, rcx, rdx = scratch for mulx
   * Callee-saved registers rbp, rbx, r12-r15 are saved and restored.
   * The result is only stored after all limbs of a and b have been read.
   */
  .global fp_mul3
  fp_mul3:
          push    rbp
          push    rbx
          push    r12
          push    r13
          push    r14
          push    r15
          push    rdi                     /* destination, reloaded after the last step */

          mov     rdi, rsi                /* rdi = a */
          mov     rsi, rdx                /* rsi = b */

          /* clear the accumulator */
          xor     r8,  r8
          xor     r9,  r9
          xor     r10, r10
          xor     r11, r11
          xor     r12, r12
          xor     r13, r13
          xor     r14, r14
          xor     r15, r15
          xor     rbp, rbp

  /*
   * One step for limb \idx of a, accumulator limbs \acc0 (lowest) .. \acc8:
   *   m    = (\acc0 + low64(a[idx] * b[0])) * (-p^-1)  mod 2^64
   *   acc += m * p
   *   acc += a[idx] * b
   * By the choice of m the lowest limb ends up zero, so the next step reuses
   * that register as its (empty) top limb instead of shifting.
   * Each multi-limb addition runs two independent carry chains: adox (OF)
   * for the low product halves and adcx (CF) for the high halves.  rax is
   * zeroed with "mov rax, 0" at the end of a chain because mov, unlike xor,
   * leaves both carry flags intact.
   */
  .macro MULSTEP, idx, acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8
          /* m -> rdx */
          mov     rdx, [rsi + 0]
          mulx    rcx, rdx, [rdi + 8*\idx]
          add     rdx, \acc0
          mulx    rcx, rdx, [rip + .inv_min_p_mod_r]

          /* acc += m * p */
          xor     rax, rax                /* clears CF and OF */
          mulx    rbx, rax, [rip + p + 0]
          adox    \acc0, rax
          mulx    rcx, rax, [rip + p + 8]
          adcx    \acc1, rbx
          adox    \acc1, rax
          mulx    rbx, rax, [rip + p + 16]
          adcx    \acc2, rcx
          adox    \acc2, rax
          mulx    rcx, rax, [rip + p + 24]
          adcx    \acc3, rbx
          adox    \acc3, rax
          mulx    rbx, rax, [rip + p + 32]
          adcx    \acc4, rcx
          adox    \acc4, rax
          mulx    rcx, rax, [rip + p + 40]
          adcx    \acc5, rbx
          adox    \acc5, rax
          mulx    rbx, rax, [rip + p + 48]
          adcx    \acc6, rcx
          adox    \acc6, rax
          mulx    rcx, rax, [rip + p + 56]
          adcx    \acc7, rbx
          adox    \acc7, rax
          mov     rax, 0                  /* flag-preserving zero */
          adcx    \acc8, rcx
          adox    \acc8, rax

          /* acc += a[idx] * b */
          mov     rdx, [rdi + 8*\idx]
          xor     rax, rax                /* clears CF and OF */
          mulx    rbx, rax, [rsi + 0]
          adox    \acc0, rax
          mulx    rcx, rax, [rsi + 8]
          adcx    \acc1, rbx
          adox    \acc1, rax
          mulx    rbx, rax, [rsi + 16]
          adcx    \acc2, rcx
          adox    \acc2, rax
          mulx    rcx, rax, [rsi + 24]
          adcx    \acc3, rbx
          adox    \acc3, rax
          mulx    rbx, rax, [rsi + 32]
          adcx    \acc4, rcx
          adox    \acc4, rax
          mulx    rcx, rax, [rsi + 40]
          adcx    \acc5, rbx
          adox    \acc5, rax
          mulx    rbx, rax, [rsi + 48]
          adcx    \acc6, rcx
          adox    \acc6, rax
          mulx    rcx, rax, [rsi + 56]
          adcx    \acc7, rbx
          adox    \acc7, rax
          mov     rax, 0                  /* flag-preserving zero */
          adcx    \acc8, rcx
          adox    \acc8, rax
  .endm

          /* the register list rotates left by one position per step */
          MULSTEP 0, r8, r9, r10, r11, r12, r13, r14, r15, rbp
          MULSTEP 1, r9, r10, r11, r12, r13, r14, r15, rbp, r8
          MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8, r9
          MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8, r9, r10
          MULSTEP 4, r12, r13, r14, r15, rbp, r8, r9, r10, r11
          MULSTEP 5, r13, r14, r15, rbp, r8, r9, r10, r11, r12
          MULSTEP 6, r14, r15, rbp, r8, r9, r10, r11, r12, r13
          MULSTEP 7, r15, rbp, r8, r9, r10, r11, r12, r13, r14

          /* after the last step the live limbs are rbp (lowest), r8 .. r14 */
          pop     rdi
          mov     [rdi + 0],  rbp
          mov     [rdi + 8],  r8
          mov     [rdi + 16], r9
          mov     [rdi + 24], r10
          mov     [rdi + 32], r11
          mov     [rdi + 40], r12
          mov     [rdi + 48], r13
          mov     [rdi + 56], r14

          pop     r15
          pop     r14
          pop     r13
          pop     r12
          pop     rbx
          pop     rbp
          jmp     .reduce_once            /* tail call: final conditional subtraction of p */
  /*
   * fp_mul2: two-operand Montgomery multiplication, accumulating into the
   * first operand.
   *   In:  rdi = destination and one factor, rsi = other factor
   * Tail-calls fp_mul3(rdi, rsi, rdi).
   */
  .global fp_mul2
  fp_mul2:
          mov     rdx, rdi                /* third argument = destination */
          jmp     fp_mul3
  /*
   * fp_sq2: Montgomery squaring.
   *   In:  rdi = destination, rsi = value to square
   * Currently just fp_mul3(rdi, rsi, rsi).
   * TODO: implement an optimized Montgomery squaring.
   */
  .global fp_sq2
  fp_sq2:
          mov     rdx, rsi                /* both factors are the same element */
          jmp     fp_mul3
  /*
   * fp_sq1: square a field element in place.
   *   In:  rdi = element to square (also the destination)
   * Tail-calls fp_sq2(rdi, rdi).
   */
  .global fp_sq1
  fp_sq1:
          mov     rsi, rdi                /* source = destination */
          jmp     fp_sq2
  /*
   * .fp_pow: in-place exponentiation by a 512-bit exponent.
   *   In:  rdi = base (overwritten with the result),
   *        rsi = exponent, eight 64-bit limbs, least significant first
   *   Out: [rdi] = base ^ exponent
   *
   * Right-to-left square-and-multiply: a 64-byte stack temporary holds the
   * running square of the base, and the result starts out as fp_1.
   * The multiply is skipped for zero exponent bits, so this is (obviously)
   * NOT constant time in the exponent.
   *
   * Stack layout after the prologue: [rsp .. rsp+63] = running square,
   * [rsp+64] = saved result pointer.
   * Register roles: rbx = exponent pointer, r13 = current exponent limb,
   * r12 = bits left in that limb.
   *
   * NOTE(review): rsp is 8 mod 16 at the calls below, which is fine for
   * the hand-written routines called here but not for an arbitrary
   * ABI-conforming callee.
   */
  .fp_pow:
          push    rbx
          mov     rbx, rsi
          push    r12
          push    r13
          push    rdi
          sub     rsp, 64

          /* temporary = base */
          mov     rsi, rdi
          mov     rdi, rsp
          call    fp_copy

          /* result = 1 (Montgomery form) */
          mov     rdi, [rsp + 64]
          lea     rsi, [rip + fp_1]
          call    fp_copy

  /* consume the 64 bits of exponent limb \limb, lowest bit first */
  .macro POWSTEP, limb
          mov     r13, [rbx + 8*\limb]
          mov     r12d, 64                /* bits left in this limb */
  0:
          test    r13, 1
          jz      1f
          mov     rdi, [rsp + 64]         /* bit set: result *= running square */
          mov     rsi, rsp
          call    fp_mul2
  1:
          mov     rdi, rsp                /* running square = running square ^ 2 */
          call    fp_sq1
          shr     r13, 1
          dec     r12
          jnz     0b
  .endm

          POWSTEP 0
          POWSTEP 1
          POWSTEP 2
          POWSTEP 3
          POWSTEP 4
          POWSTEP 5
          POWSTEP 6
          POWSTEP 7

          add     rsp, 64+8               /* drop the temporary and the saved rdi */
          pop     r13
          pop     r12
          pop     rbx
          ret
  .section .rodata

  /* the exponent p - 2, least significant limb first */
  .p_minus_2:
          .quad 0x1b81b90533c6c879, 0xc2721bf457aca835
          .quad 0x516730cc1f0b4f25, 0xa7aac6c567f35507
          .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42
          .quad 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf

  .section .text

  /*
   * fp_inv: replace the field element at rdi by its (p - 2)-th power,
   * which is its multiplicative inverse when p is prime.
   *   In:  rdi = element, overwritten with the result
   * Tail-calls .fp_pow, so the running time depends only on the fixed
   * exponent p - 2.
   * TODO: use a better addition chain?
   */
  .global fp_inv
  fp_inv:
          lea     rsi, [rip + .p_minus_2]
          jmp     .fp_pow
  .section .rodata

  /* the exponent (p - 1) / 2, least significant limb first */
  .p_minus_1_halves:
          .quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a
          .quad 0xa8b398660f85a792, 0xd3d56362b3f9aa83
          .quad 0x2d7dfe63499164e6, 0x5a16841d76e44621
          .quad 0xfe455868af1f2625, 0x32da4747ba07c4df

  .section .text

  /*
   * fp_issquare: test whether x ^ ((p - 1) / 2) equals 1.
   *   In:  rdi = field element x; it is OVERWRITTEN with the power
   *   Out: rax = 1 if the power equals fp_1, otherwise 0
   * The comparison ORs together the XOR differences of all eight limbs,
   * so it does not exit early on the first mismatch.
   * TODO: use a better addition chain?
   */
  .global fp_issquare
  fp_issquare:
          push    rdi
          lea     rsi, [rip + .p_minus_1_halves]
          call    .fp_pow
          pop     rdi

          xor     rax, rax                /* accumulated difference bits */
  .irp off, 0, 8, 16, 24, 32, 40, 48, 56
          mov     rdx, [rdi + \off]
          xor     rdx, [rip + fp_1 + \off]
          or      rax, rdx
  .endr
          test    rax, rax
          setz    al                      /* 1 iff every limb matched fp_1 */
          movzx   eax, al                 /* 32-bit write zero-extends into rax */
          ret
  /*
   * fp_random: fill the field element at rdi with a uniformly random value
   * in [0, p) by rejection sampling.
   *   In:  rdi = destination (eight 64-bit limbs)
   *   Calls randombytes(rdi, 64) (defined elsewhere) to obtain 64 random
   *   bytes, clears the bits above bit pbits-1, and retries until the
   *   candidate is strictly below p.
   *
   * The candidate is compared with p from the most significant limb down,
   * as UNSIGNED 64-bit values:
   *   limb above p's limb  -> candidate > p, draw again
   *   limb below p's limb  -> candidate < p, accept
   *   limbs equal          -> the next lower limb decides
   * If all eight limbs are equal the candidate is p itself and is rejected.
   * (The previous signed jge/jl pair always branched on the top limb, so
   * the lower limbs were never examined and every candidate sharing p's
   * top limb was rejected.)
   *
   * Not constant time (but this shouldn't leak anything of importance).
   */
  .global fp_random
  fp_random:
  .Lfp_random_retry:
          push    rdi                     /* also keeps rsp 16-byte aligned at the call */
          mov     rsi, 64
          call    randombytes
          pop     rdi

          /* keep only the low (pbits % 64) bits of the top limb */
          mov     rax, 1
          shl     rax, (pbits % 64)
          dec     rax
          and     [rdi + 56], rax

  .irp off, 56, 48, 40, 32, 24, 16, 8, 0
          mov     rax, [rip + p + \off]
          cmp     [rdi + \off], rax
          ja      .Lfp_random_retry       /* candidate > p */
          jb      .Lfp_random_done        /* candidate < p */
  .endr
          jmp     .Lfp_random_retry       /* candidate == p */

  .Lfp_random_done:
          ret