  1. #!/usr/bin/env perl
  2. # Copyright (c) 2014, Intel Corporation.
  3. #
  4. # Permission to use, copy, modify, and/or distribute this software for any
  5. # purpose with or without fee is hereby granted, provided that the above
  6. # copyright notice and this permission notice appear in all copies.
  7. #
  8. # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9. # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10. # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  11. # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12. # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
  13. # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
  14. # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15. # Developers and authors:
  16. # Shay Gueron (1, 2), and Vlad Krasnov (1)
  17. # (1) Intel Corporation, Israel Development Center
  18. # (2) University of Haifa
  19. # Reference:
  20. # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
  21. # 256 Bit Primes"
  22. # Further optimization by <appro@openssl.org>:
  23. #
  24. # this/original
  25. # Opteron +12-49%
  26. # Bulldozer +14-45%
  27. # P4 +18-46%
  28. # Westmere +12-34%
  29. # Sandy Bridge +9-35%
  30. # Ivy Bridge +9-35%
  31. # Haswell +8-37%
  32. # Broadwell +18-58%
  33. # Atom +15-50%
  34. # VIA Nano +43-160%
  35. #
  36. # Ranges denote minimum and maximum improvement coefficients depending
  37. # on benchmark.
  38. $flavour = shift;
  39. $output = shift;
  40. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  41. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  42. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  43. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  44. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  45. die "can't locate x86_64-xlate.pl";
  46. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  47. *STDOUT=*OUT;
  48. # TODO: enable these after testing. $avx goes to two and $addx to one.
  49. $avx=0;
  50. $addx=0;
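# ($addx gates the alternative BMI2/ADX code paths below, i.e. the
# __ecp_nistz256_*_montx subroutines built around mulx/adcx/adox and
# dispatched at run time on the OPENSSL_ia32cap_P bits; $avx>1 gates the
# AVX2 table-select routines.  With both left at 0 only the generic
# mulq-based and SSE2 code is emitted.)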
  51. $code.=<<___;
  52. .text
  53. .extern OPENSSL_ia32cap_P
  54. # The polynomial
  55. .align 64
  56. .Lpoly:
  57. .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
  58. .LOne:
  59. .long 1,1,1,1,1,1,1,1
  60. .LTwo:
  61. .long 2,2,2,2,2,2,2,2
  62. .LThree:
  63. .long 3,3,3,3,3,3,3,3
  64. .LONE_mont:
  65. .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
  66. ___
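# .Lpoly holds the NIST P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1 as
# four little-endian 64-bit limbs.  .LONE_mont is 1 in Montgomery form, i.e.
# 2^256 mod p.  .LOne/.LTwo/.LThree are vectors of 32-bit constants used as
# index counters by the constant-time table-select routines further down.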
  67. {
  68. ################################################################################
  69. # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
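# Computes res = 2*a mod p.  The doubling is done with add/adc, the carry out
# of the top limb is turned into an all-ones/zero mask in $t4 by sbb, copies
# of the doubled value are saved, p is subtracted, and test/cmov then selects
# between the two candidates without a data-dependent branch.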
  70. my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
  71. my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
  72. my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
  73. $code.=<<___;
  74. .type ecp_nistz256_mul_by_2,\@function,2
  75. .align 64
  76. ecp_nistz256_mul_by_2:
  77. push %r12
  78. push %r13
  79. mov 8*0($a_ptr), $a0
  80. mov 8*1($a_ptr), $a1
  81. add $a0, $a0 # a0:a3+a0:a3
  82. mov 8*2($a_ptr), $a2
  83. adc $a1, $a1
  84. mov 8*3($a_ptr), $a3
  85. lea .Lpoly(%rip), $a_ptr
  86. mov $a0, $t0
  87. adc $a2, $a2
  88. adc $a3, $a3
  89. mov $a1, $t1
  90. sbb $t4, $t4
  91. sub 8*0($a_ptr), $a0
  92. mov $a2, $t2
  93. sbb 8*1($a_ptr), $a1
  94. sbb 8*2($a_ptr), $a2
  95. mov $a3, $t3
  96. sbb 8*3($a_ptr), $a3
  97. test $t4, $t4
  98. cmovz $t0, $a0
  99. cmovz $t1, $a1
  100. mov $a0, 8*0($r_ptr)
  101. cmovz $t2, $a2
  102. mov $a1, 8*1($r_ptr)
  103. cmovz $t3, $a3
  104. mov $a2, 8*2($r_ptr)
  105. mov $a3, 8*3($r_ptr)
  106. pop %r13
  107. pop %r12
  108. ret
  109. .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
  110. ################################################################################
  111. # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
  112. .globl ecp_nistz256_neg
  113. .type ecp_nistz256_neg,\@function,2
  114. .align 32
  115. ecp_nistz256_neg:
  116. push %r12
  117. push %r13
  118. xor $a0, $a0
  119. xor $a1, $a1
  120. xor $a2, $a2
  121. xor $a3, $a3
  122. xor $t4, $t4
  123. sub 8*0($a_ptr), $a0
  124. sbb 8*1($a_ptr), $a1
  125. sbb 8*2($a_ptr), $a2
  126. mov $a0, $t0
  127. sbb 8*3($a_ptr), $a3
  128. lea .Lpoly(%rip), $a_ptr
  129. mov $a1, $t1
  130. sbb \$0, $t4
  131. add 8*0($a_ptr), $a0
  132. mov $a2, $t2
  133. adc 8*1($a_ptr), $a1
  134. adc 8*2($a_ptr), $a2
  135. mov $a3, $t3
  136. adc 8*3($a_ptr), $a3
  137. test $t4, $t4
  138. cmovz $t0, $a0
  139. cmovz $t1, $a1
  140. mov $a0, 8*0($r_ptr)
  141. cmovz $t2, $a2
  142. mov $a1, 8*1($r_ptr)
  143. cmovz $t3, $a3
  144. mov $a2, 8*2($r_ptr)
  145. mov $a3, 8*3($r_ptr)
  146. pop %r13
  147. pop %r12
  148. ret
  149. .size ecp_nistz256_neg,.-ecp_nistz256_neg
  150. ___
  151. }
  152. {
  153. my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
  154. my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
  155. my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
  156. my ($poly1,$poly3)=($acc6,$acc7);
  157. $code.=<<___;
  158. ################################################################################
  159. # void ecp_nistz256_mul_mont(
  160. # uint64_t res[4],
  161. # uint64_t a[4],
  162. # uint64_t b[4]);
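#
# Montgomery multiplication with R = 2^256: res = a*b*2^-256 mod p.
# The worker routines below interleave the multiplication by each limb b[i]
# with one reduction step, so the full 512-bit product never has to be
# materialized and everything stays in registers.  When the BMI2/ADX path is
# enabled, the run-time check on OPENSSL_ia32cap_P dispatches to
# __ecp_nistz256_mul_montx, otherwise __ecp_nistz256_mul_montq is used.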
  163. .globl ecp_nistz256_mul_mont
  164. .type ecp_nistz256_mul_mont,\@function,3
  165. .align 32
  166. ecp_nistz256_mul_mont:
  167. ___
  168. $code.=<<___ if ($addx);
  169. mov \$0x80100, %ecx
  170. and OPENSSL_ia32cap_P+8(%rip), %ecx
  171. ___
  172. $code.=<<___;
  173. .Lmul_mont:
  174. push %rbp
  175. push %rbx
  176. push %r12
  177. push %r13
  178. push %r14
  179. push %r15
  180. ___
  181. $code.=<<___ if ($addx);
  182. cmp \$0x80100, %ecx
  183. je .Lmul_montx
  184. ___
  185. $code.=<<___;
  186. mov $b_org, $b_ptr
  187. mov 8*0($b_org), %rax
  188. mov 8*0($a_ptr), $acc1
  189. mov 8*1($a_ptr), $acc2
  190. mov 8*2($a_ptr), $acc3
  191. mov 8*3($a_ptr), $acc4
  192. call __ecp_nistz256_mul_montq
  193. ___
  194. $code.=<<___ if ($addx);
  195. jmp .Lmul_mont_done
  196. .align 32
  197. .Lmul_montx:
  198. mov $b_org, $b_ptr
  199. mov 8*0($b_org), %rdx
  200. mov 8*0($a_ptr), $acc1
  201. mov 8*1($a_ptr), $acc2
  202. mov 8*2($a_ptr), $acc3
  203. mov 8*3($a_ptr), $acc4
  204. lea -128($a_ptr), $a_ptr # control u-op density
  205. call __ecp_nistz256_mul_montx
  206. ___
  207. $code.=<<___;
  208. .Lmul_mont_done:
  209. pop %r15
  210. pop %r14
  211. pop %r13
  212. pop %r12
  213. pop %rbx
  214. pop %rbp
  215. ret
  216. .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
  217. .type __ecp_nistz256_mul_montq,\@abi-omnipotent
  218. .align 32
  219. __ecp_nistz256_mul_montq:
  220. ########################################################################
  221. # Multiply a by b[0]
  222. mov %rax, $t1
  223. mulq $acc1
  224. mov .Lpoly+8*1(%rip),$poly1
  225. mov %rax, $acc0
  226. mov $t1, %rax
  227. mov %rdx, $acc1
  228. mulq $acc2
  229. mov .Lpoly+8*3(%rip),$poly3
  230. add %rax, $acc1
  231. mov $t1, %rax
  232. adc \$0, %rdx
  233. mov %rdx, $acc2
  234. mulq $acc3
  235. add %rax, $acc2
  236. mov $t1, %rax
  237. adc \$0, %rdx
  238. mov %rdx, $acc3
  239. mulq $acc4
  240. add %rax, $acc3
  241. mov $acc0, %rax
  242. adc \$0, %rdx
  243. xor $acc5, $acc5
  244. mov %rdx, $acc4
  245. ########################################################################
  246. # First reduction step
  247. # Basically now we want to multiply acc[0] by p256,
  248. # and add the result to the acc.
  249. # Due to the special form of p256 we do some optimizations
  250. #
  251. # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
  252. # then we add acc[0] and get acc[0] x 2^96
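#
# In more detail: p256 = 2^256 - 2^224 + 2^192 + 2^96 - 1, and since
# p256 = -1 mod 2^64 the Montgomery multiplier for this step is acc[0]
# itself.  The low 128 bits of p256 equal 2^96 - 1, so the low part of
# acc[0] x p256 is acc[0] x 2^96 - acc[0]; adding the acc[0] already in the
# accumulator leaves exactly acc[0] x 2^96, which the shl/shr pair feeds
# into acc[1]:acc[2].  p256[2] is zero, and the top limb 0xffffffff00000001
# is handled by the mulq $poly3, whose 128-bit product is added into
# acc[3]:acc[4].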
  253. mov $acc0, $t1
  254. shl \$32, $acc0
  255. mulq $poly3
  256. shr \$32, $t1
  257. add $acc0, $acc1 # +=acc[0]<<96
  258. adc $t1, $acc2
  259. adc %rax, $acc3
  260. mov 8*1($b_ptr), %rax
  261. adc %rdx, $acc4
  262. adc \$0, $acc5
  263. xor $acc0, $acc0
  264. ########################################################################
  265. # Multiply by b[1]
  266. mov %rax, $t1
  267. mulq 8*0($a_ptr)
  268. add %rax, $acc1
  269. mov $t1, %rax
  270. adc \$0, %rdx
  271. mov %rdx, $t0
  272. mulq 8*1($a_ptr)
  273. add $t0, $acc2
  274. adc \$0, %rdx
  275. add %rax, $acc2
  276. mov $t1, %rax
  277. adc \$0, %rdx
  278. mov %rdx, $t0
  279. mulq 8*2($a_ptr)
  280. add $t0, $acc3
  281. adc \$0, %rdx
  282. add %rax, $acc3
  283. mov $t1, %rax
  284. adc \$0, %rdx
  285. mov %rdx, $t0
  286. mulq 8*3($a_ptr)
  287. add $t0, $acc4
  288. adc \$0, %rdx
  289. add %rax, $acc4
  290. mov $acc1, %rax
  291. adc %rdx, $acc5
  292. adc \$0, $acc0
  293. ########################################################################
  294. # Second reduction step
  295. mov $acc1, $t1
  296. shl \$32, $acc1
  297. mulq $poly3
  298. shr \$32, $t1
  299. add $acc1, $acc2
  300. adc $t1, $acc3
  301. adc %rax, $acc4
  302. mov 8*2($b_ptr), %rax
  303. adc %rdx, $acc5
  304. adc \$0, $acc0
  305. xor $acc1, $acc1
  306. ########################################################################
  307. # Multiply by b[2]
  308. mov %rax, $t1
  309. mulq 8*0($a_ptr)
  310. add %rax, $acc2
  311. mov $t1, %rax
  312. adc \$0, %rdx
  313. mov %rdx, $t0
  314. mulq 8*1($a_ptr)
  315. add $t0, $acc3
  316. adc \$0, %rdx
  317. add %rax, $acc3
  318. mov $t1, %rax
  319. adc \$0, %rdx
  320. mov %rdx, $t0
  321. mulq 8*2($a_ptr)
  322. add $t0, $acc4
  323. adc \$0, %rdx
  324. add %rax, $acc4
  325. mov $t1, %rax
  326. adc \$0, %rdx
  327. mov %rdx, $t0
  328. mulq 8*3($a_ptr)
  329. add $t0, $acc5
  330. adc \$0, %rdx
  331. add %rax, $acc5
  332. mov $acc2, %rax
  333. adc %rdx, $acc0
  334. adc \$0, $acc1
  335. ########################################################################
  336. # Third reduction step
  337. mov $acc2, $t1
  338. shl \$32, $acc2
  339. mulq $poly3
  340. shr \$32, $t1
  341. add $acc2, $acc3
  342. adc $t1, $acc4
  343. adc %rax, $acc5
  344. mov 8*3($b_ptr), %rax
  345. adc %rdx, $acc0
  346. adc \$0, $acc1
  347. xor $acc2, $acc2
  348. ########################################################################
  349. # Multiply by b[3]
  350. mov %rax, $t1
  351. mulq 8*0($a_ptr)
  352. add %rax, $acc3
  353. mov $t1, %rax
  354. adc \$0, %rdx
  355. mov %rdx, $t0
  356. mulq 8*1($a_ptr)
  357. add $t0, $acc4
  358. adc \$0, %rdx
  359. add %rax, $acc4
  360. mov $t1, %rax
  361. adc \$0, %rdx
  362. mov %rdx, $t0
  363. mulq 8*2($a_ptr)
  364. add $t0, $acc5
  365. adc \$0, %rdx
  366. add %rax, $acc5
  367. mov $t1, %rax
  368. adc \$0, %rdx
  369. mov %rdx, $t0
  370. mulq 8*3($a_ptr)
  371. add $t0, $acc0
  372. adc \$0, %rdx
  373. add %rax, $acc0
  374. mov $acc3, %rax
  375. adc %rdx, $acc1
  376. adc \$0, $acc2
  377. ########################################################################
  378. # Final reduction step
  379. mov $acc3, $t1
  380. shl \$32, $acc3
  381. mulq $poly3
  382. shr \$32, $t1
  383. add $acc3, $acc4
  384. adc $t1, $acc5
  385. mov $acc4, $t0
  386. adc %rax, $acc0
  387. adc %rdx, $acc1
  388. mov $acc5, $t1
  389. adc \$0, $acc2
  390. ########################################################################
  391. # Branch-less conditional subtraction of P
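#
# The four result limbs plus the carry word may still exceed p.  Subtract p
# once, let the final sbb from the carry word record whether that subtraction
# borrowed, and use the cmovc chain to put the unsubtracted copies back if it
# did; this is a constant-time alternative to comparing and branching.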
  392. sub \$-1, $acc4 # .Lpoly[0]
  393. mov $acc0, $t2
  394. sbb $poly1, $acc5 # .Lpoly[1]
  395. sbb \$0, $acc0 # .Lpoly[2]
  396. mov $acc1, $t3
  397. sbb $poly3, $acc1 # .Lpoly[3]
  398. sbb \$0, $acc2
  399. cmovc $t0, $acc4
  400. cmovc $t1, $acc5
  401. mov $acc4, 8*0($r_ptr)
  402. cmovc $t2, $acc0
  403. mov $acc5, 8*1($r_ptr)
  404. cmovc $t3, $acc1
  405. mov $acc0, 8*2($r_ptr)
  406. mov $acc1, 8*3($r_ptr)
  407. ret
  408. .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
  409. ################################################################################
  410. # void ecp_nistz256_sqr_mont(
  411. # uint64_t res[4],
  412. # uint64_t a[4]);
  413. # we optimize the square according to S.Gueron and V.Krasnov,
  414. # "Speeding up Big-Number Squaring"
  415. .globl ecp_nistz256_sqr_mont
  416. .type ecp_nistz256_sqr_mont,\@function,2
  417. .align 32
  418. ecp_nistz256_sqr_mont:
  419. ___
  420. $code.=<<___ if ($addx);
  421. mov \$0x80100, %ecx
  422. and OPENSSL_ia32cap_P+8(%rip), %ecx
  423. ___
  424. $code.=<<___;
  425. push %rbp
  426. push %rbx
  427. push %r12
  428. push %r13
  429. push %r14
  430. push %r15
  431. ___
  432. $code.=<<___ if ($addx);
  433. cmp \$0x80100, %ecx
  434. je .Lsqr_montx
  435. ___
  436. $code.=<<___;
  437. mov 8*0($a_ptr), %rax
  438. mov 8*1($a_ptr), $acc6
  439. mov 8*2($a_ptr), $acc7
  440. mov 8*3($a_ptr), $acc0
  441. call __ecp_nistz256_sqr_montq
  442. ___
  443. $code.=<<___ if ($addx);
  444. jmp .Lsqr_mont_done
  445. .align 32
  446. .Lsqr_montx:
  447. mov 8*0($a_ptr), %rdx
  448. mov 8*1($a_ptr), $acc6
  449. mov 8*2($a_ptr), $acc7
  450. mov 8*3($a_ptr), $acc0
  451. lea -128($a_ptr), $a_ptr # control u-op density
  452. call __ecp_nistz256_sqr_montx
  453. ___
  454. $code.=<<___;
  455. .Lsqr_mont_done:
  456. pop %r15
  457. pop %r14
  458. pop %r13
  459. pop %r12
  460. pop %rbx
  461. pop %rbp
  462. ret
  463. .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
  464. .type __ecp_nistz256_sqr_montq,\@abi-omnipotent
  465. .align 32
  466. __ecp_nistz256_sqr_montq:
  467. mov %rax, $acc5
  468. mulq $acc6 # a[1]*a[0]
  469. mov %rax, $acc1
  470. mov $acc7, %rax
  471. mov %rdx, $acc2
  472. mulq $acc5 # a[0]*a[2]
  473. add %rax, $acc2
  474. mov $acc0, %rax
  475. adc \$0, %rdx
  476. mov %rdx, $acc3
  477. mulq $acc5 # a[0]*a[3]
  478. add %rax, $acc3
  479. mov $acc7, %rax
  480. adc \$0, %rdx
  481. mov %rdx, $acc4
  482. #################################
  483. mulq $acc6 # a[1]*a[2]
  484. add %rax, $acc3
  485. mov $acc0, %rax
  486. adc \$0, %rdx
  487. mov %rdx, $t1
  488. mulq $acc6 # a[1]*a[3]
  489. add %rax, $acc4
  490. mov $acc0, %rax
  491. adc \$0, %rdx
  492. add $t1, $acc4
  493. mov %rdx, $acc5
  494. adc \$0, $acc5
  495. #################################
  496. mulq $acc7 # a[2]*a[3]
  497. xor $acc7, $acc7
  498. add %rax, $acc5
  499. mov 8*0($a_ptr), %rax
  500. mov %rdx, $acc6
  501. adc \$0, $acc6
  502. add $acc1, $acc1 # acc1:6<<1
  503. adc $acc2, $acc2
  504. adc $acc3, $acc3
  505. adc $acc4, $acc4
  506. adc $acc5, $acc5
  507. adc $acc6, $acc6
  508. adc \$0, $acc7
  509. mulq %rax
  510. mov %rax, $acc0
  511. mov 8*1($a_ptr), %rax
  512. mov %rdx, $t0
  513. mulq %rax
  514. add $t0, $acc1
  515. adc %rax, $acc2
  516. mov 8*2($a_ptr), %rax
  517. adc \$0, %rdx
  518. mov %rdx, $t0
  519. mulq %rax
  520. add $t0, $acc3
  521. adc %rax, $acc4
  522. mov 8*3($a_ptr), %rax
  523. adc \$0, %rdx
  524. mov %rdx, $t0
  525. mulq %rax
  526. add $t0, $acc5
  527. adc %rax, $acc6
  528. mov $acc0, %rax
  529. adc %rdx, $acc7
  530. mov .Lpoly+8*1(%rip), $a_ptr
  531. mov .Lpoly+8*3(%rip), $t1
  532. ##########################################
  533. # Now the reduction
  534. # First iteration
  535. mov $acc0, $t0
  536. shl \$32, $acc0
  537. mulq $t1
  538. shr \$32, $t0
  539. add $acc0, $acc1 # +=acc[0]<<96
  540. adc $t0, $acc2
  541. adc %rax, $acc3
  542. mov $acc1, %rax
  543. adc \$0, %rdx
  544. ##########################################
  545. # Second iteration
  546. mov $acc1, $t0
  547. shl \$32, $acc1
  548. mov %rdx, $acc0
  549. mulq $t1
  550. shr \$32, $t0
  551. add $acc1, $acc2
  552. adc $t0, $acc3
  553. adc %rax, $acc0
  554. mov $acc2, %rax
  555. adc \$0, %rdx
  556. ##########################################
  557. # Third iteration
  558. mov $acc2, $t0
  559. shl \$32, $acc2
  560. mov %rdx, $acc1
  561. mulq $t1
  562. shr \$32, $t0
  563. add $acc2, $acc3
  564. adc $t0, $acc0
  565. adc %rax, $acc1
  566. mov $acc3, %rax
  567. adc \$0, %rdx
  568. ###########################################
  569. # Last iteration
  570. mov $acc3, $t0
  571. shl \$32, $acc3
  572. mov %rdx, $acc2
  573. mulq $t1
  574. shr \$32, $t0
  575. add $acc3, $acc0
  576. adc $t0, $acc1
  577. adc %rax, $acc2
  578. adc \$0, %rdx
  579. xor $acc3, $acc3
  580. ############################################
  581. # Add the rest of the acc
  582. add $acc0, $acc4
  583. adc $acc1, $acc5
  584. mov $acc4, $acc0
  585. adc $acc2, $acc6
  586. adc %rdx, $acc7
  587. mov $acc5, $acc1
  588. adc \$0, $acc3
  589. sub \$-1, $acc4 # .Lpoly[0]
  590. mov $acc6, $acc2
  591. sbb $a_ptr, $acc5 # .Lpoly[1]
  592. sbb \$0, $acc6 # .Lpoly[2]
  593. mov $acc7, $t0
  594. sbb $t1, $acc7 # .Lpoly[3]
  595. sbb \$0, $acc3
  596. cmovc $acc0, $acc4
  597. cmovc $acc1, $acc5
  598. mov $acc4, 8*0($r_ptr)
  599. cmovc $acc2, $acc6
  600. mov $acc5, 8*1($r_ptr)
  601. cmovc $t0, $acc7
  602. mov $acc6, 8*2($r_ptr)
  603. mov $acc7, 8*3($r_ptr)
  604. ret
  605. .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
  606. ___
  607. if ($addx) {
  608. $code.=<<___;
  609. .type __ecp_nistz256_mul_montx,\@abi-omnipotent
  610. .align 32
  611. __ecp_nistz256_mul_montx:
  612. ########################################################################
  613. # Multiply by b[0]
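# The BMI2/ADX variant keeps two independent carry chains in flight: mulx
# computes 64x64->128 products without touching the flags, adcx adds using
# only CF and adox adds using only OF, so column additions for neighbouring
# limbs can be interleaved without saving and restoring flags.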
  614. mulx $acc1, $acc0, $acc1
  615. mulx $acc2, $t0, $acc2
  616. mov \$32, $poly1
  617. xor $acc5, $acc5 # cf=0
  618. mulx $acc3, $t1, $acc3
  619. mov .Lpoly+8*3(%rip), $poly3
  620. adc $t0, $acc1
  621. mulx $acc4, $t0, $acc4
  622. mov $acc0, %rdx
  623. adc $t1, $acc2
  624. shlx $poly1,$acc0,$t1
  625. adc $t0, $acc3
  626. shrx $poly1,$acc0,$t0
  627. adc \$0, $acc4
  628. ########################################################################
  629. # First reduction step
  630. add $t1, $acc1
  631. adc $t0, $acc2
  632. mulx $poly3, $t0, $t1
  633. mov 8*1($b_ptr), %rdx
  634. adc $t0, $acc3
  635. adc $t1, $acc4
  636. adc \$0, $acc5
  637. xor $acc0, $acc0 # $acc0=0,cf=0,of=0
  638. ########################################################################
  639. # Multiply by b[1]
  640. mulx 8*0+128($a_ptr), $t0, $t1
  641. adcx $t0, $acc1
  642. adox $t1, $acc2
  643. mulx 8*1+128($a_ptr), $t0, $t1
  644. adcx $t0, $acc2
  645. adox $t1, $acc3
  646. mulx 8*2+128($a_ptr), $t0, $t1
  647. adcx $t0, $acc3
  648. adox $t1, $acc4
  649. mulx 8*3+128($a_ptr), $t0, $t1
  650. mov $acc1, %rdx
  651. adcx $t0, $acc4
  652. shlx $poly1, $acc1, $t0
  653. adox $t1, $acc5
  654. shrx $poly1, $acc1, $t1
  655. adcx $acc0, $acc5
  656. adox $acc0, $acc0
  657. adc \$0, $acc0
  658. ########################################################################
  659. # Second reduction step
  660. add $t0, $acc2
  661. adc $t1, $acc3
  662. mulx $poly3, $t0, $t1
  663. mov 8*2($b_ptr), %rdx
  664. adc $t0, $acc4
  665. adc $t1, $acc5
  666. adc \$0, $acc0
  667. xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
  668. ########################################################################
  669. # Multiply by b[2]
  670. mulx 8*0+128($a_ptr), $t0, $t1
  671. adcx $t0, $acc2
  672. adox $t1, $acc3
  673. mulx 8*1+128($a_ptr), $t0, $t1
  674. adcx $t0, $acc3
  675. adox $t1, $acc4
  676. mulx 8*2+128($a_ptr), $t0, $t1
  677. adcx $t0, $acc4
  678. adox $t1, $acc5
  679. mulx 8*3+128($a_ptr), $t0, $t1
  680. mov $acc2, %rdx
  681. adcx $t0, $acc5
  682. shlx $poly1, $acc2, $t0
  683. adox $t1, $acc0
  684. shrx $poly1, $acc2, $t1
  685. adcx $acc1, $acc0
  686. adox $acc1, $acc1
  687. adc \$0, $acc1
  688. ########################################################################
  689. # Third reduction step
  690. add $t0, $acc3
  691. adc $t1, $acc4
  692. mulx $poly3, $t0, $t1
  693. mov 8*3($b_ptr), %rdx
  694. adc $t0, $acc5
  695. adc $t1, $acc0
  696. adc \$0, $acc1
  697. xor $acc2, $acc2 # $acc2=0,cf=0,of=0
  698. ########################################################################
  699. # Multiply by b[3]
  700. mulx 8*0+128($a_ptr), $t0, $t1
  701. adcx $t0, $acc3
  702. adox $t1, $acc4
  703. mulx 8*1+128($a_ptr), $t0, $t1
  704. adcx $t0, $acc4
  705. adox $t1, $acc5
  706. mulx 8*2+128($a_ptr), $t0, $t1
  707. adcx $t0, $acc5
  708. adox $t1, $acc0
  709. mulx 8*3+128($a_ptr), $t0, $t1
  710. mov $acc3, %rdx
  711. adcx $t0, $acc0
  712. shlx $poly1, $acc3, $t0
  713. adox $t1, $acc1
  714. shrx $poly1, $acc3, $t1
  715. adcx $acc2, $acc1
  716. adox $acc2, $acc2
  717. adc \$0, $acc2
  718. ########################################################################
  719. # Fourth reduction step
  720. add $t0, $acc4
  721. adc $t1, $acc5
  722. mulx $poly3, $t0, $t1
  723. mov $acc4, $t2
  724. mov .Lpoly+8*1(%rip), $poly1
  725. adc $t0, $acc0
  726. mov $acc5, $t3
  727. adc $t1, $acc1
  728. adc \$0, $acc2
  729. ########################################################################
  730. # Branch-less conditional subtraction of P
  731. xor %eax, %eax
  732. mov $acc0, $t0
  733. sbb \$-1, $acc4 # .Lpoly[0]
  734. sbb $poly1, $acc5 # .Lpoly[1]
  735. sbb \$0, $acc0 # .Lpoly[2]
  736. mov $acc1, $t1
  737. sbb $poly3, $acc1 # .Lpoly[3]
  738. sbb \$0, $acc2
  739. cmovc $t2, $acc4
  740. cmovc $t3, $acc5
  741. mov $acc4, 8*0($r_ptr)
  742. cmovc $t0, $acc0
  743. mov $acc5, 8*1($r_ptr)
  744. cmovc $t1, $acc1
  745. mov $acc0, 8*2($r_ptr)
  746. mov $acc1, 8*3($r_ptr)
  747. ret
  748. .size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
  749. .type __ecp_nistz256_sqr_montx,\@abi-omnipotent
  750. .align 32
  751. __ecp_nistz256_sqr_montx:
  752. mulx $acc6, $acc1, $acc2 # a[0]*a[1]
  753. mulx $acc7, $t0, $acc3 # a[0]*a[2]
  754. xor %eax, %eax
  755. adc $t0, $acc2
  756. mulx $acc0, $t1, $acc4 # a[0]*a[3]
  757. mov $acc6, %rdx
  758. adc $t1, $acc3
  759. adc \$0, $acc4
  760. xor $acc5, $acc5 # $acc5=0,cf=0,of=0
  761. #################################
  762. mulx $acc7, $t0, $t1 # a[1]*a[2]
  763. adcx $t0, $acc3
  764. adox $t1, $acc4
  765. mulx $acc0, $t0, $t1 # a[1]*a[3]
  766. mov $acc7, %rdx
  767. adcx $t0, $acc4
  768. adox $t1, $acc5
  769. adc \$0, $acc5
  770. #################################
  771. mulx $acc0, $t0, $acc6 # a[2]*a[3]
  772. mov 8*0+128($a_ptr), %rdx
  773. xor $acc7, $acc7 # $acc7=0,cf=0,of=0
  774. adcx $acc1, $acc1 # acc1:6<<1
  775. adox $t0, $acc5
  776. adcx $acc2, $acc2
  777. adox $acc7, $acc6 # of=0
  778. mulx %rdx, $acc0, $t1
  779. mov 8*1+128($a_ptr), %rdx
  780. adcx $acc3, $acc3
  781. adox $t1, $acc1
  782. adcx $acc4, $acc4
  783. mulx %rdx, $t0, $t4
  784. mov 8*2+128($a_ptr), %rdx
  785. adcx $acc5, $acc5
  786. adox $t0, $acc2
  787. adcx $acc6, $acc6
  788. .byte 0x67
  789. mulx %rdx, $t0, $t1
  790. mov 8*3+128($a_ptr), %rdx
  791. adox $t4, $acc3
  792. adcx $acc7, $acc7
  793. adox $t0, $acc4
  794. mov \$32, $a_ptr
  795. adox $t1, $acc5
  796. .byte 0x67,0x67
  797. mulx %rdx, $t0, $t4
  798. mov $acc0, %rdx
  799. adox $t0, $acc6
  800. shlx $a_ptr, $acc0, $t0
  801. adox $t4, $acc7
  802. shrx $a_ptr, $acc0, $t4
  803. mov .Lpoly+8*3(%rip), $t1
  804. # reduction step 1
  805. add $t0, $acc1
  806. adc $t4, $acc2
  807. mulx $t1, $t0, $acc0
  808. mov $acc1, %rdx
  809. adc $t0, $acc3
  810. shlx $a_ptr, $acc1, $t0
  811. adc \$0, $acc0
  812. shrx $a_ptr, $acc1, $t4
  813. # reduction step 2
  814. add $t0, $acc2
  815. adc $t4, $acc3
  816. mulx $t1, $t0, $acc1
  817. mov $acc2, %rdx
  818. adc $t0, $acc0
  819. shlx $a_ptr, $acc2, $t0
  820. adc \$0, $acc1
  821. shrx $a_ptr, $acc2, $t4
  822. # reduction step 3
  823. add $t0, $acc3
  824. adc $t4, $acc0
  825. mulx $t1, $t0, $acc2
  826. mov $acc3, %rdx
  827. adc $t0, $acc1
  828. shlx $a_ptr, $acc3, $t0
  829. adc \$0, $acc2
  830. shrx $a_ptr, $acc3, $t4
  831. # reduction step 4
  832. add $t0, $acc0
  833. adc $t4, $acc1
  834. mulx $t1, $t0, $acc3
  835. adc $t0, $acc2
  836. adc \$0, $acc3
  837. xor $t3, $t3 # cf=0
  838. adc $acc0, $acc4 # accumulate upper half
  839. mov .Lpoly+8*1(%rip), $a_ptr
  840. adc $acc1, $acc5
  841. mov $acc4, $acc0
  842. adc $acc2, $acc6
  843. adc $acc3, $acc7
  844. mov $acc5, $acc1
  845. adc \$0, $t3
  846. xor %eax, %eax # cf=0
  847. sbb \$-1, $acc4 # .Lpoly[0]
  848. mov $acc6, $acc2
  849. sbb $a_ptr, $acc5 # .Lpoly[1]
  850. sbb \$0, $acc6 # .Lpoly[2]
  851. mov $acc7, $acc3
  852. sbb $t1, $acc7 # .Lpoly[3]
  853. sbb \$0, $t3
  854. cmovc $acc0, $acc4
  855. cmovc $acc1, $acc5
  856. mov $acc4, 8*0($r_ptr)
  857. cmovc $acc2, $acc6
  858. mov $acc5, 8*1($r_ptr)
  859. cmovc $acc3, $acc7
  860. mov $acc6, 8*2($r_ptr)
  861. mov $acc7, 8*3($r_ptr)
  862. ret
  863. .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
  864. ___
  865. }
  866. }
  867. {
  868. my ($r_ptr,$in_ptr)=("%rdi","%rsi");
  869. my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
  870. my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
  871. $code.=<<___;
  872. ################################################################################
  873. # void ecp_nistz256_from_mont(
  874. # uint64_t res[4],
  875. # uint64_t in[4]);
  876. # This one performs Montgomery multiplication by 1, so we only need the reduction
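# i.e. it computes in * 2^-256 mod p, taking the value out of the Montgomery
# domain.  Multiplying by 1 contributes nothing, so the body is just the four
# shift-and-mulq reduction iterations followed by a final conditional
# subtraction of p.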
  877. .globl ecp_nistz256_from_mont
  878. .type ecp_nistz256_from_mont,\@function,2
  879. .align 32
  880. ecp_nistz256_from_mont:
  881. push %r12
  882. push %r13
  883. mov 8*0($in_ptr), %rax
  884. mov .Lpoly+8*3(%rip), $t2
  885. mov 8*1($in_ptr), $acc1
  886. mov 8*2($in_ptr), $acc2
  887. mov 8*3($in_ptr), $acc3
  888. mov %rax, $acc0
  889. mov .Lpoly+8*1(%rip), $t1
  890. #########################################
  891. # First iteration
  892. mov %rax, $t0
  893. shl \$32, $acc0
  894. mulq $t2
  895. shr \$32, $t0
  896. add $acc0, $acc1
  897. adc $t0, $acc2
  898. adc %rax, $acc3
  899. mov $acc1, %rax
  900. adc \$0, %rdx
  901. #########################################
  902. # Second iteration
  903. mov $acc1, $t0
  904. shl \$32, $acc1
  905. mov %rdx, $acc0
  906. mulq $t2
  907. shr \$32, $t0
  908. add $acc1, $acc2
  909. adc $t0, $acc3
  910. adc %rax, $acc0
  911. mov $acc2, %rax
  912. adc \$0, %rdx
  913. ##########################################
  914. # Third iteration
  915. mov $acc2, $t0
  916. shl \$32, $acc2
  917. mov %rdx, $acc1
  918. mulq $t2
  919. shr \$32, $t0
  920. add $acc2, $acc3
  921. adc $t0, $acc0
  922. adc %rax, $acc1
  923. mov $acc3, %rax
  924. adc \$0, %rdx
  925. ###########################################
  926. # Last iteration
  927. mov $acc3, $t0
  928. shl \$32, $acc3
  929. mov %rdx, $acc2
  930. mulq $t2
  931. shr \$32, $t0
  932. add $acc3, $acc0
  933. adc $t0, $acc1
  934. mov $acc0, $t0
  935. adc %rax, $acc2
  936. mov $acc1, $in_ptr
  937. adc \$0, %rdx
  938. sub \$-1, $acc0
  939. mov $acc2, %rax
  940. sbb $t1, $acc1
  941. sbb \$0, $acc2
  942. mov %rdx, $acc3
  943. sbb $t2, %rdx
  944. sbb $t2, $t2
  945. cmovnz $t0, $acc0
  946. cmovnz $in_ptr, $acc1
  947. mov $acc0, 8*0($r_ptr)
  948. cmovnz %rax, $acc2
  949. mov $acc1, 8*1($r_ptr)
  950. cmovz %rdx, $acc3
  951. mov $acc2, 8*2($r_ptr)
  952. mov $acc3, 8*3($r_ptr)
  953. pop %r13
  954. pop %r12
  955. ret
  956. .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
  957. ___
  958. }
  959. {
  960. my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
  961. my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
  962. my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
  963. my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
  964. $code.=<<___;
  965. ################################################################################
  966. # void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
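# Constant-time table lookup for the window-5 path: all 16 table entries
# (96 bytes each, a Jacobian X|Y|Z triple) are read, and pcmpeqd turns
# "counter == index" into an all-ones/zero mask that pand/por use to retain
# only the requested entry.  The memory access pattern and instruction flow
# are therefore independent of the secret index.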
  967. .globl ecp_nistz256_select_w5
  968. .type ecp_nistz256_select_w5,\@abi-omnipotent
  969. .align 32
  970. ecp_nistz256_select_w5:
  971. ___
  972. $code.=<<___ if ($avx>1);
  973. mov OPENSSL_ia32cap_P+8(%rip), %eax
  974. test \$`1<<5`, %eax
  975. jnz .Lavx2_select_w5
  976. ___
  977. $code.=<<___ if ($win64);
  978. lea -0x88(%rsp), %rax
  979. .LSEH_begin_ecp_nistz256_select_w5:
  980. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
  981. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
  982. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
  983. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
  984. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
  985. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
  986. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
  987. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
  988. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
  989. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
  990. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
  991. ___
  992. $code.=<<___;
  993. movdqa .LOne(%rip), $ONE
  994. movd $index, $INDEX
  995. pxor $Ra, $Ra
  996. pxor $Rb, $Rb
  997. pxor $Rc, $Rc
  998. pxor $Rd, $Rd
  999. pxor $Re, $Re
  1000. pxor $Rf, $Rf
  1001. movdqa $ONE, $M0
  1002. pshufd \$0, $INDEX, $INDEX
  1003. mov \$16, %rax
  1004. .Lselect_loop_sse_w5:
  1005. movdqa $M0, $TMP0
  1006. paddd $ONE, $M0
  1007. pcmpeqd $INDEX, $TMP0
  1008. movdqa 16*0($in_t), $T0a
  1009. movdqa 16*1($in_t), $T0b
  1010. movdqa 16*2($in_t), $T0c
  1011. movdqa 16*3($in_t), $T0d
  1012. movdqa 16*4($in_t), $T0e
  1013. movdqa 16*5($in_t), $T0f
  1014. lea 16*6($in_t), $in_t
  1015. pand $TMP0, $T0a
  1016. pand $TMP0, $T0b
  1017. por $T0a, $Ra
  1018. pand $TMP0, $T0c
  1019. por $T0b, $Rb
  1020. pand $TMP0, $T0d
  1021. por $T0c, $Rc
  1022. pand $TMP0, $T0e
  1023. por $T0d, $Rd
  1024. pand $TMP0, $T0f
  1025. por $T0e, $Re
  1026. por $T0f, $Rf
  1027. dec %rax
  1028. jnz .Lselect_loop_sse_w5
  1029. movdqu $Ra, 16*0($val)
  1030. movdqu $Rb, 16*1($val)
  1031. movdqu $Rc, 16*2($val)
  1032. movdqu $Rd, 16*3($val)
  1033. movdqu $Re, 16*4($val)
  1034. movdqu $Rf, 16*5($val)
  1035. ___
  1036. $code.=<<___ if ($win64);
  1037. movaps (%rsp), %xmm6
  1038. movaps 0x10(%rsp), %xmm7
  1039. movaps 0x20(%rsp), %xmm8
  1040. movaps 0x30(%rsp), %xmm9
  1041. movaps 0x40(%rsp), %xmm10
  1042. movaps 0x50(%rsp), %xmm11
  1043. movaps 0x60(%rsp), %xmm12
  1044. movaps 0x70(%rsp), %xmm13
  1045. movaps 0x80(%rsp), %xmm14
  1046. movaps 0x90(%rsp), %xmm15
  1047. lea 0xa8(%rsp), %rsp
  1048. .LSEH_end_ecp_nistz256_select_w5:
  1049. ___
  1050. $code.=<<___;
  1051. ret
  1052. .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
  1053. ################################################################################
  1054. # void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
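# Same constant-time selection for the window-7 path: 64 affine entries of
# 64 bytes each (X|Y only), all of them scanned, with a pcmpeqd-generated
# mask deciding which one is accumulated into the result.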
  1055. .globl ecp_nistz256_select_w7
  1056. .type ecp_nistz256_select_w7,\@abi-omnipotent
  1057. .align 32
  1058. ecp_nistz256_select_w7:
  1059. ___
  1060. $code.=<<___ if ($avx>1);
  1061. mov OPENSSL_ia32cap_P+8(%rip), %eax
  1062. test \$`1<<5`, %eax
  1063. jnz .Lavx2_select_w7
  1064. ___
  1065. $code.=<<___ if ($win64);
  1066. lea -0x88(%rsp), %rax
  1067. .LSEH_begin_ecp_nistz256_select_w7:
  1068. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
  1069. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
  1070. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
  1071. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
  1072. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
  1073. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
  1074. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
  1075. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
  1076. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
  1077. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
  1078. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
  1079. ___
  1080. $code.=<<___;
  1081. movdqa .LOne(%rip), $M0
  1082. movd $index, $INDEX
  1083. pxor $Ra, $Ra
  1084. pxor $Rb, $Rb
  1085. pxor $Rc, $Rc
  1086. pxor $Rd, $Rd
  1087. movdqa $M0, $ONE
  1088. pshufd \$0, $INDEX, $INDEX
  1089. mov \$64, %rax
  1090. .Lselect_loop_sse_w7:
  1091. movdqa $M0, $TMP0
  1092. paddd $ONE, $M0
  1093. movdqa 16*0($in_t), $T0a
  1094. movdqa 16*1($in_t), $T0b
  1095. pcmpeqd $INDEX, $TMP0
  1096. movdqa 16*2($in_t), $T0c
  1097. movdqa 16*3($in_t), $T0d
  1098. lea 16*4($in_t), $in_t
  1099. pand $TMP0, $T0a
  1100. pand $TMP0, $T0b
  1101. por $T0a, $Ra
  1102. pand $TMP0, $T0c
  1103. por $T0b, $Rb
  1104. pand $TMP0, $T0d
  1105. por $T0c, $Rc
  1106. prefetcht0 255($in_t)
  1107. por $T0d, $Rd
  1108. dec %rax
  1109. jnz .Lselect_loop_sse_w7
  1110. movdqu $Ra, 16*0($val)
  1111. movdqu $Rb, 16*1($val)
  1112. movdqu $Rc, 16*2($val)
  1113. movdqu $Rd, 16*3($val)
  1114. ___
  1115. $code.=<<___ if ($win64);
  1116. movaps (%rsp), %xmm6
  1117. movaps 0x10(%rsp), %xmm7
  1118. movaps 0x20(%rsp), %xmm8
  1119. movaps 0x30(%rsp), %xmm9
  1120. movaps 0x40(%rsp), %xmm10
  1121. movaps 0x50(%rsp), %xmm11
  1122. movaps 0x60(%rsp), %xmm12
  1123. movaps 0x70(%rsp), %xmm13
  1124. movaps 0x80(%rsp), %xmm14
  1125. movaps 0x90(%rsp), %xmm15
  1126. lea 0xa8(%rsp), %rsp
  1127. .LSEH_end_ecp_nistz256_select_w7:
  1128. ___
  1129. $code.=<<___;
  1130. ret
  1131. .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
  1132. ___
  1133. }
  1134. if ($avx>1) {
  1135. my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
  1136. my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
  1137. my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
  1138. my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
  1139. $code.=<<___;
  1140. ################################################################################
  1141. # void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
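# AVX2 version of the window-5 select: two table entries are examined per
# iteration (the counters $M0 and $M1 start at 1 and 2 and step by .LTwo),
# so 8 iterations cover all 16 entries while still touching every entry to
# keep the access pattern independent of the index.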
  1142. .type ecp_nistz256_avx2_select_w5,\@abi-omnipotent
  1143. .align 32
  1144. ecp_nistz256_avx2_select_w5:
  1145. .Lavx2_select_w5:
  1146. vzeroupper
  1147. ___
  1148. $code.=<<___ if ($win64);
  1149. lea -0x88(%rsp), %rax
  1150. .LSEH_begin_ecp_nistz256_avx2_select_w5:
  1151. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
  1152. .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
  1153. .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
  1154. .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax)
  1155. .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
  1156. .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
  1157. .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
  1158. .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
  1159. .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
  1160. .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
  1161. .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
  1162. ___
  1163. $code.=<<___;
  1164. vmovdqa .LTwo(%rip), $TWO
  1165. vpxor $Ra, $Ra, $Ra
  1166. vpxor $Rb, $Rb, $Rb
  1167. vpxor $Rc, $Rc, $Rc
  1168. vmovdqa .LOne(%rip), $M0
  1169. vmovdqa .LTwo(%rip), $M1
  1170. vmovd $index, %xmm1
  1171. vpermd $INDEX, $Ra, $INDEX
  1172. mov \$8, %rax
  1173. .Lselect_loop_avx2_w5:
  1174. vmovdqa 32*0($in_t), $T0a
  1175. vmovdqa 32*1($in_t), $T0b
  1176. vmovdqa 32*2($in_t), $T0c
  1177. vmovdqa 32*3($in_t), $T1a
  1178. vmovdqa 32*4($in_t), $T1b
  1179. vmovdqa 32*5($in_t), $T1c
  1180. vpcmpeqd $INDEX, $M0, $TMP0
  1181. vpcmpeqd $INDEX, $M1, $TMP1
  1182. vpaddd $TWO, $M0, $M0
  1183. vpaddd $TWO, $M1, $M1
  1184. lea 32*6($in_t), $in_t
  1185. vpand $TMP0, $T0a, $T0a
  1186. vpand $TMP0, $T0b, $T0b
  1187. vpand $TMP0, $T0c, $T0c
  1188. vpand $TMP1, $T1a, $T1a
  1189. vpand $TMP1, $T1b, $T1b
  1190. vpand $TMP1, $T1c, $T1c
  1191. vpxor $T0a, $Ra, $Ra
  1192. vpxor $T0b, $Rb, $Rb
  1193. vpxor $T0c, $Rc, $Rc
  1194. vpxor $T1a, $Ra, $Ra
  1195. vpxor $T1b, $Rb, $Rb
  1196. vpxor $T1c, $Rc, $Rc
  1197. dec %rax
  1198. jnz .Lselect_loop_avx2_w5
  1199. vmovdqu $Ra, 32*0($val)
  1200. vmovdqu $Rb, 32*1($val)
  1201. vmovdqu $Rc, 32*2($val)
  1202. vzeroupper
  1203. ___
  1204. $code.=<<___ if ($win64);
  1205. movaps (%rsp), %xmm6
  1206. movaps 0x10(%rsp), %xmm7
  1207. movaps 0x20(%rsp), %xmm8
  1208. movaps 0x30(%rsp), %xmm9
  1209. movaps 0x40(%rsp), %xmm10
  1210. movaps 0x50(%rsp), %xmm11
  1211. movaps 0x60(%rsp), %xmm12
  1212. movaps 0x70(%rsp), %xmm13
  1213. movaps 0x80(%rsp), %xmm14
  1214. movaps 0x90(%rsp), %xmm15
  1215. lea 0xa8(%rsp), %rsp
  1216. .LSEH_end_ecp_nistz256_avx2_select_w5:
  1217. ___
  1218. $code.=<<___;
  1219. ret
  1220. .size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
  1221. ___
  1222. }
  1223. if ($avx>1) {
  1224. my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
  1225. my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
  1226. my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
  1227. my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
  1228. my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
  1229. $code.=<<___;
  1230. ################################################################################
  1231. # void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
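# AVX2 version of the window-7 select: three entries per iteration with the
# counters $M0/$M1/$M2 stepping by .LThree.  21 iterations cover indices
# 1..63 and the tail after the loop handles entry 64; index 0 is never
# stored because it stands for the point at infinity (see the note below).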
  1232. .globl ecp_nistz256_avx2_select_w7
  1233. .type ecp_nistz256_avx2_select_w7,\@abi-omnipotent
  1234. .align 32
  1235. ecp_nistz256_avx2_select_w7:
  1236. .Lavx2_select_w7:
  1237. vzeroupper
  1238. ___
  1239. $code.=<<___ if ($win64);
  1240. lea -0x88(%rsp), %rax
  1241. .LSEH_begin_ecp_nistz256_avx2_select_w7:
  1242. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
  1243. .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
  1244. .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
  1245. .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax)
  1246. .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
  1247. .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
  1248. .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
  1249. .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
  1250. .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
  1251. .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
  1252. .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
  1253. ___
  1254. $code.=<<___;
  1255. vmovdqa .LThree(%rip), $THREE
  1256. vpxor $Ra, $Ra, $Ra
  1257. vpxor $Rb, $Rb, $Rb
  1258. vmovdqa .LOne(%rip), $M0
  1259. vmovdqa .LTwo(%rip), $M1
  1260. vmovdqa .LThree(%rip), $M2
  1261. vmovd $index, %xmm1
  1262. vpermd $INDEX, $Ra, $INDEX
  1263. # Skip index = 0, because it is implicitly the point at infinity
  1264. mov \$21, %rax
  1265. .Lselect_loop_avx2_w7:
  1266. vmovdqa 32*0($in_t), $T0a
  1267. vmovdqa 32*1($in_t), $T0b
  1268. vmovdqa 32*2($in_t), $T1a
  1269. vmovdqa 32*3($in_t), $T1b
  1270. vmovdqa 32*4($in_t), $T2a
  1271. vmovdqa 32*5($in_t), $T2b
  1272. vpcmpeqd $INDEX, $M0, $TMP0
  1273. vpcmpeqd $INDEX, $M1, $TMP1
  1274. vpcmpeqd $INDEX, $M2, $TMP2
  1275. vpaddd $THREE, $M0, $M0
  1276. vpaddd $THREE, $M1, $M1
  1277. vpaddd $THREE, $M2, $M2
  1278. lea 32*6($in_t), $in_t
  1279. vpand $TMP0, $T0a, $T0a
  1280. vpand $TMP0, $T0b, $T0b
  1281. vpand $TMP1, $T1a, $T1a
  1282. vpand $TMP1, $T1b, $T1b
  1283. vpand $TMP2, $T2a, $T2a
  1284. vpand $TMP2, $T2b, $T2b
  1285. vpxor $T0a, $Ra, $Ra
  1286. vpxor $T0b, $Rb, $Rb
  1287. vpxor $T1a, $Ra, $Ra
  1288. vpxor $T1b, $Rb, $Rb
  1289. vpxor $T2a, $Ra, $Ra
  1290. vpxor $T2b, $Rb, $Rb
  1291. dec %rax
  1292. jnz .Lselect_loop_avx2_w7
  1293. vmovdqa 32*0($in_t), $T0a
  1294. vmovdqa 32*1($in_t), $T0b
  1295. vpcmpeqd $INDEX, $M0, $TMP0
  1296. vpand $TMP0, $T0a, $T0a
  1297. vpand $TMP0, $T0b, $T0b
  1298. vpxor $T0a, $Ra, $Ra
  1299. vpxor $T0b, $Rb, $Rb
  1300. vmovdqu $Ra, 32*0($val)
  1301. vmovdqu $Rb, 32*1($val)
  1302. vzeroupper
  1303. ___
  1304. $code.=<<___ if ($win64);
  1305. movaps (%rsp), %xmm6
  1306. movaps 0x10(%rsp), %xmm7
  1307. movaps 0x20(%rsp), %xmm8
  1308. movaps 0x30(%rsp), %xmm9
  1309. movaps 0x40(%rsp), %xmm10
  1310. movaps 0x50(%rsp), %xmm11
  1311. movaps 0x60(%rsp), %xmm12
  1312. movaps 0x70(%rsp), %xmm13
  1313. movaps 0x80(%rsp), %xmm14
  1314. movaps 0x90(%rsp), %xmm15
  1315. lea 0xa8(%rsp), %rsp
  1316. .LSEH_end_ecp_nistz256_avx2_select_w7:
  1317. ___
  1318. $code.=<<___;
  1319. ret
  1320. .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
  1321. ___
  1322. } else {
  1323. $code.=<<___;
  1324. .globl ecp_nistz256_avx2_select_w7
  1325. .type ecp_nistz256_avx2_select_w7,\@function,3
  1326. .align 32
  1327. ecp_nistz256_avx2_select_w7:
  1328. .byte 0x0f,0x0b # ud2
  1329. ret
  1330. .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
  1331. ___
  1332. }
  1333. {{{
  1334. ########################################################################
  1335. # This block implements the higher-level point_double, point_add and
  1336. # point_add_affine routines. The key to performance in this case is to
  1337. # allow out-of-order execution logic to overlap computations from the
  1338. # next step with tail processing from the current step. By using a
  1339. # tailored calling sequence we minimize inter-step overhead and give the
  1340. # processor a better shot at overlapping operations...
  1341. #
  1342. # You will notice that input data is copied to the stack. The trouble is
  1343. # that there are no registers to spare for holding the original pointers,
  1344. # and reloading those pointers would create undesired dependencies on the
  1345. # effective-address calculation paths. In other words, it is done this way
  1346. # to favour out-of-order execution logic.
  1347. # <appro@openssl.org>
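# The point routines below work on Jacobian projective coordinates, i.e.
# triples (X, Y, Z) of field elements in Montgomery form with affine point
# (X/Z^2, Y/Z^3).  Every field operation is one of the Montgomery primitives
# defined above, and the temporaries named in the gen_* subs (S, M, Zsqr, H,
# R, U1, U2, S1, S2, ...) live in 32-byte stack slots whose offsets are
# produced by map(32*$_, ...).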
  1348. my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
  1349. my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
  1350. my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
  1351. my ($poly1,$poly3)=($acc6,$acc7);
  1352. sub load_for_mul () {
  1353. my ($a,$b,$src0) = @_;
  1354. my $bias = $src0 eq "%rax" ? 0 : -128;
  1355. " mov $b, $src0
  1356. lea $b, $b_ptr
  1357. mov 8*0+$a, $acc1
  1358. mov 8*1+$a, $acc2
  1359. lea $bias+$a, $a_ptr
  1360. mov 8*2+$a, $acc3
  1361. mov 8*3+$a, $acc4"
  1362. }
  1363. sub load_for_sqr () {
  1364. my ($a,$src0) = @_;
  1365. my $bias = $src0 eq "%rax" ? 0 : -128;
  1366. " mov 8*0+$a, $src0
  1367. mov 8*1+$a, $acc6
  1368. lea $bias+$a, $a_ptr
  1369. mov 8*2+$a, $acc7
  1370. mov 8*3+$a, $acc0"
  1371. }
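# load_for_mul/load_for_sqr emit the small register preamble every
# __ecp_nistz256_mul_mont*/sqr_mont* worker expects: the leading limb of one
# operand in $src0 (%rax for the mulq path, %rdx for the mulx path), the
# remaining limbs in the fixed accumulator registers, and $a_ptr/$b_ptr left
# pointing at the operands.  The -128 bias applied on the mulx path matches
# the 8*n+128 displacements used there, presumably to lengthen the
# instruction encodings and so "control u-op density" as noted earlier.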
  1372. {
  1373. ########################################################################
  1374. # operate in 4-5-0-1 "name space" that matches multiplication output
  1375. #
  1376. my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
  1377. $code.=<<___;
  1378. .type __ecp_nistz256_add_toq,\@abi-omnipotent
  1379. .align 32
  1380. __ecp_nistz256_add_toq:
  1381. add 8*0($b_ptr), $a0
  1382. adc 8*1($b_ptr), $a1
  1383. mov $a0, $t0
  1384. adc 8*2($b_ptr), $a2
  1385. adc 8*3($b_ptr), $a3
  1386. mov $a1, $t1
  1387. sbb $t4, $t4
  1388. sub \$-1, $a0
  1389. mov $a2, $t2
  1390. sbb $poly1, $a1
  1391. sbb \$0, $a2
  1392. mov $a3, $t3
  1393. sbb $poly3, $a3
  1394. test $t4, $t4
  1395. cmovz $t0, $a0
  1396. cmovz $t1, $a1
  1397. mov $a0, 8*0($r_ptr)
  1398. cmovz $t2, $a2
  1399. mov $a1, 8*1($r_ptr)
  1400. cmovz $t3, $a3
  1401. mov $a2, 8*2($r_ptr)
  1402. mov $a3, 8*3($r_ptr)
  1403. ret
  1404. .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
  1405. .type __ecp_nistz256_sub_fromq,\@abi-omnipotent
  1406. .align 32
  1407. __ecp_nistz256_sub_fromq:
  1408. sub 8*0($b_ptr), $a0
  1409. sbb 8*1($b_ptr), $a1
  1410. mov $a0, $t0
  1411. sbb 8*2($b_ptr), $a2
  1412. sbb 8*3($b_ptr), $a3
  1413. mov $a1, $t1
  1414. sbb $t4, $t4
  1415. add \$-1, $a0
  1416. mov $a2, $t2
  1417. adc $poly1, $a1
  1418. adc \$0, $a2
  1419. mov $a3, $t3
  1420. adc $poly3, $a3
  1421. test $t4, $t4
  1422. cmovz $t0, $a0
  1423. cmovz $t1, $a1
  1424. mov $a0, 8*0($r_ptr)
  1425. cmovz $t2, $a2
  1426. mov $a1, 8*1($r_ptr)
  1427. cmovz $t3, $a3
  1428. mov $a2, 8*2($r_ptr)
  1429. mov $a3, 8*3($r_ptr)
  1430. ret
  1431. .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
  1432. .type __ecp_nistz256_subq,\@abi-omnipotent
  1433. .align 32
  1434. __ecp_nistz256_subq:
  1435. sub $a0, $t0
  1436. sbb $a1, $t1
  1437. mov $t0, $a0
  1438. sbb $a2, $t2
  1439. sbb $a3, $t3
  1440. mov $t1, $a1
  1441. sbb $t4, $t4
  1442. add \$-1, $t0
  1443. mov $t2, $a2
  1444. adc $poly1, $t1
  1445. adc \$0, $t2
  1446. mov $t3, $a3
  1447. adc $poly3, $t3
  1448. test $t4, $t4
  1449. cmovnz $t0, $a0
  1450. cmovnz $t1, $a1
  1451. cmovnz $t2, $a2
  1452. cmovnz $t3, $a3
  1453. ret
  1454. .size __ecp_nistz256_subq,.-__ecp_nistz256_subq
  1455. .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
  1456. .align 32
  1457. __ecp_nistz256_mul_by_2q:
  1458. add $a0, $a0 # a0:a3+a0:a3
  1459. adc $a1, $a1
  1460. mov $a0, $t0
  1461. adc $a2, $a2
  1462. adc $a3, $a3
  1463. mov $a1, $t1
  1464. sbb $t4, $t4
  1465. sub \$-1, $a0
  1466. mov $a2, $t2
  1467. sbb $poly1, $a1
  1468. sbb \$0, $a2
  1469. mov $a3, $t3
  1470. sbb $poly3, $a3
  1471. test $t4, $t4
  1472. cmovz $t0, $a0
  1473. cmovz $t1, $a1
  1474. mov $a0, 8*0($r_ptr)
  1475. cmovz $t2, $a2
  1476. mov $a1, 8*1($r_ptr)
  1477. cmovz $t3, $a3
  1478. mov $a2, 8*2($r_ptr)
  1479. mov $a3, 8*3($r_ptr)
  1480. ret
  1481. .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
  1482. ___
  1483. }
  1484. sub gen_double () {
  1485. my $x = shift;
  1486. my ($src0,$sfx,$bias);
  1487. my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
  1488. if ($x ne "x") {
  1489. $src0 = "%rax";
  1490. $sfx = "";
  1491. $bias = 0;
  1492. $code.=<<___;
  1493. .globl ecp_nistz256_point_double
  1494. .type ecp_nistz256_point_double,\@function,2
  1495. .align 32
  1496. ecp_nistz256_point_double:
  1497. ___
  1498. $code.=<<___ if ($addx);
  1499. mov \$0x80100, %ecx
  1500. and OPENSSL_ia32cap_P+8(%rip), %ecx
  1501. cmp \$0x80100, %ecx
  1502. je .Lpoint_doublex
  1503. ___
  1504. } else {
  1505. $src0 = "%rdx";
  1506. $sfx = "x";
  1507. $bias = 128;
  1508. $code.=<<___;
  1509. .type ecp_nistz256_point_doublex,\@function,2
  1510. .align 32
  1511. ecp_nistz256_point_doublex:
  1512. .Lpoint_doublex:
  1513. ___
  1514. }
  1515. $code.=<<___;
  1516. push %rbp
  1517. push %rbx
  1518. push %r12
  1519. push %r13
  1520. push %r14
  1521. push %r15
  1522. sub \$32*5+8, %rsp
  1523. .Lpoint_double_shortcut$x:
  1524. movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
  1525. mov $a_ptr, $b_ptr # backup copy
  1526. movdqu 0x10($a_ptr), %xmm1
  1527. mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order
  1528. mov 0x20+8*1($a_ptr), $acc5
  1529. mov 0x20+8*2($a_ptr), $acc0
  1530. mov 0x20+8*3($a_ptr), $acc1
  1531. mov .Lpoly+8*1(%rip), $poly1
  1532. mov .Lpoly+8*3(%rip), $poly3
  1533. movdqa %xmm0, $in_x(%rsp)
  1534. movdqa %xmm1, $in_x+0x10(%rsp)
  1535. lea 0x20($r_ptr), $acc2
  1536. lea 0x40($r_ptr), $acc3
  1537. movq $r_ptr, %xmm0
  1538. movq $acc2, %xmm1
  1539. movq $acc3, %xmm2
  1540. lea $S(%rsp), $r_ptr
  1541. call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y);
  1542. mov 0x40+8*0($a_ptr), $src0
  1543. mov 0x40+8*1($a_ptr), $acc6
  1544. mov 0x40+8*2($a_ptr), $acc7
  1545. mov 0x40+8*3($a_ptr), $acc0
  1546. lea 0x40-$bias($a_ptr), $a_ptr
  1547. lea $Zsqr(%rsp), $r_ptr
  1548. call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z);
  1549. `&load_for_sqr("$S(%rsp)", "$src0")`
  1550. lea $S(%rsp), $r_ptr
  1551. call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S);
  1552. mov 0x20($b_ptr), $src0 # $b_ptr is still valid
  1553. mov 0x40+8*0($b_ptr), $acc1
  1554. mov 0x40+8*1($b_ptr), $acc2
  1555. mov 0x40+8*2($b_ptr), $acc3
  1556. mov 0x40+8*3($b_ptr), $acc4
  1557. lea 0x40-$bias($b_ptr), $a_ptr
  1558. lea 0x20($b_ptr), $b_ptr
  1559. movq %xmm2, $r_ptr
  1560. call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y);
  1561. call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z);
  1562. mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
  1563. mov $in_x+8*1(%rsp), $acc5
  1564. lea $Zsqr(%rsp), $b_ptr
  1565. mov $in_x+8*2(%rsp), $acc0
  1566. mov $in_x+8*3(%rsp), $acc1
  1567. lea $M(%rsp), $r_ptr
  1568. call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr);
  1569. mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
  1570. mov $in_x+8*1(%rsp), $acc5
  1571. lea $Zsqr(%rsp), $b_ptr
  1572. mov $in_x+8*2(%rsp), $acc0
  1573. mov $in_x+8*3(%rsp), $acc1
  1574. lea $Zsqr(%rsp), $r_ptr
  1575. call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr);
  1576. `&load_for_sqr("$S(%rsp)", "$src0")`
  1577. movq %xmm1, $r_ptr
  1578. call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
  1579. ___
  1580. {
  1581. ######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
  1582. # operate in 4-5-6-7 "name space" that matches squaring output
  1583. #
  1584. my ($poly1,$poly3)=($a_ptr,$t1);
  1585. my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
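# Halving mod p: an even value is simply shifted right by one; an odd value
# has p added first (making the sum even), with the carry out of that
# addition becoming the new top bit after the shift.  The code below adds p
# unconditionally, then tests bit 0 of the original low limb and uses cmovz
# to keep the un-added copy when the value was even, before the 4-limb right
# shift.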
  1586. $code.=<<___;
  1587. xor $t4, $t4
  1588. mov $a0, $t0
  1589. add \$-1, $a0
  1590. mov $a1, $t1
  1591. adc $poly1, $a1
  1592. mov $a2, $t2
  1593. adc \$0, $a2
  1594. mov $a3, $t3
  1595. adc $poly3, $a3
  1596. adc \$0, $t4
  1597. xor $a_ptr, $a_ptr # borrow $a_ptr
  1598. test \$1, $t0
  1599. cmovz $t0, $a0
  1600. cmovz $t1, $a1
  1601. cmovz $t2, $a2
  1602. cmovz $t3, $a3
  1603. cmovz $a_ptr, $t4
  1604. mov $a1, $t0 # a0:a3>>1
  1605. shr \$1, $a0
  1606. shl \$63, $t0
  1607. mov $a2, $t1
  1608. shr \$1, $a1
  1609. or $t0, $a0
  1610. shl \$63, $t1
  1611. mov $a3, $t2
  1612. shr \$1, $a2
  1613. or $t1, $a1
  1614. shl \$63, $t2
  1615. mov $a0, 8*0($r_ptr)
  1616. shr \$1, $a3
  1617. mov $a1, 8*1($r_ptr)
  1618. shl \$63, $t4
  1619. or $t2, $a2
  1620. or $t4, $a3
  1621. mov $a2, 8*2($r_ptr)
  1622. mov $a3, 8*3($r_ptr)
  1623. ___
  1624. }
  1625. $code.=<<___;
  1626. `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
  1627. lea $M(%rsp), $r_ptr
  1628. call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr);
  1629. lea $tmp0(%rsp), $r_ptr
  1630. call __ecp_nistz256_mul_by_2$x
  1631. lea $M(%rsp), $b_ptr
  1632. lea $M(%rsp), $r_ptr
  1633. call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M);
  1634. `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
  1635. lea $S(%rsp), $r_ptr
  1636. call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x);
  1637. lea $tmp0(%rsp), $r_ptr
  1638. call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S);
  1639. `&load_for_sqr("$M(%rsp)", "$src0")`
  1640. movq %xmm0, $r_ptr
  1641. call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M);
  1642. lea $tmp0(%rsp), $b_ptr
  1643. mov $acc6, $acc0 # harmonize sqr output and sub input
  1644. mov $acc7, $acc1
  1645. mov $a_ptr, $poly1
  1646. mov $t1, $poly3
  1647. call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0);
  1648. mov $S+8*0(%rsp), $t0
  1649. mov $S+8*1(%rsp), $t1
  1650. mov $S+8*2(%rsp), $t2
  1651. mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order
  1652. lea $S(%rsp), $r_ptr
  1653. call __ecp_nistz256_sub$x # p256_sub(S, S, res_x);
  1654. mov $M(%rsp), $src0
  1655. lea $M(%rsp), $b_ptr
  1656. mov $acc4, $acc6 # harmonize sub output and mul input
  1657. xor %ecx, %ecx
  1658. mov $acc4, $S+8*0(%rsp) # have to save:-(
  1659. mov $acc5, $acc2
  1660. mov $acc5, $S+8*1(%rsp)
  1661. cmovz $acc0, $acc3
  1662. mov $acc0, $S+8*2(%rsp)
  1663. lea $S-$bias(%rsp), $a_ptr
  1664. cmovz $acc1, $acc4
  1665. mov $acc1, $S+8*3(%rsp)
  1666. mov $acc6, $acc1
  1667. lea $S(%rsp), $r_ptr
  1668. call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M);
  1669. movq %xmm1, $b_ptr
  1670. movq %xmm1, $r_ptr
  1671. call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
  1672. add \$32*5+8, %rsp
  1673. pop %r15
  1674. pop %r14
  1675. pop %r13
  1676. pop %r12
  1677. pop %rbx
  1678. pop %rbp
  1679. ret
  1680. .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
  1681. ___
  1682. }
  1683. &gen_double("q");
  1684. sub gen_add () {
  1685. my $x = shift;
  1686. my ($src0,$sfx,$bias);
  1687. my ($H,$Hsqr,$R,$Rsqr,$Hcub,
  1688. $U1,$U2,$S1,$S2,
  1689. $res_x,$res_y,$res_z,
  1690. $in1_x,$in1_y,$in1_z,
  1691. $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
  1692. my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
  1693. if ($x ne "x") {
  1694. $src0 = "%rax";
  1695. $sfx = "";
  1696. $bias = 0;
  1697. $code.=<<___;
  1698. .globl ecp_nistz256_point_add
  1699. .type ecp_nistz256_point_add,\@function,3
  1700. .align 32
  1701. ecp_nistz256_point_add:
  1702. ___
  1703. $code.=<<___ if ($addx);
  1704. mov \$0x80100, %ecx
  1705. and OPENSSL_ia32cap_P+8(%rip), %ecx
  1706. cmp \$0x80100, %ecx
  1707. je .Lpoint_addx
  1708. ___
  1709. } else {
  1710. $src0 = "%rdx";
  1711. $sfx = "x";
  1712. $bias = 128;
  1713. $code.=<<___;
  1714. .type ecp_nistz256_point_addx,\@function,3
  1715. .align 32
  1716. ecp_nistz256_point_addx:
  1717. .Lpoint_addx:
  1718. ___
  1719. }
  1720. $code.=<<___;
  1721. push %rbp
  1722. push %rbx
  1723. push %r12
  1724. push %r13
  1725. push %r14
  1726. push %r15
  1727. sub \$32*18+8, %rsp
  1728. movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
  1729. movdqu 0x10($a_ptr), %xmm1
  1730. movdqu 0x20($a_ptr), %xmm2
  1731. movdqu 0x30($a_ptr), %xmm3
  1732. movdqu 0x40($a_ptr), %xmm4
  1733. movdqu 0x50($a_ptr), %xmm5
  1734. mov $a_ptr, $b_ptr # reassign
  1735. mov $b_org, $a_ptr # reassign
  1736. movdqa %xmm0, $in1_x(%rsp)
  1737. movdqa %xmm1, $in1_x+0x10(%rsp)
  1738. por %xmm0, %xmm1
  1739. movdqa %xmm2, $in1_y(%rsp)
  1740. movdqa %xmm3, $in1_y+0x10(%rsp)
  1741. por %xmm2, %xmm3
  1742. movdqa %xmm4, $in1_z(%rsp)
  1743. movdqa %xmm5, $in1_z+0x10(%rsp)
  1744. por %xmm1, %xmm3
  1745. movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
  1746. pshufd \$0xb1, %xmm3, %xmm5
  1747. movdqu 0x10($a_ptr), %xmm1
  1748. movdqu 0x20($a_ptr), %xmm2
  1749. por %xmm3, %xmm5
  1750. movdqu 0x30($a_ptr), %xmm3
  1751. mov 0x40+8*0($a_ptr), $src0 # load original in2_z
  1752. mov 0x40+8*1($a_ptr), $acc6
  1753. mov 0x40+8*2($a_ptr), $acc7
  1754. mov 0x40+8*3($a_ptr), $acc0
  1755. movdqa %xmm0, $in2_x(%rsp)
  1756. pshufd \$0x1e, %xmm5, %xmm4
  1757. movdqa %xmm1, $in2_x+0x10(%rsp)
  1758. por %xmm0, %xmm1
  1759. movq $r_ptr, %xmm0 # save $r_ptr
  1760. movdqa %xmm2, $in2_y(%rsp)
  1761. movdqa %xmm3, $in2_y+0x10(%rsp)
  1762. por %xmm2, %xmm3
  1763. por %xmm4, %xmm5
  1764. pxor %xmm4, %xmm4
  1765. por %xmm1, %xmm3
  1766. lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
  1767. mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
  1768. mov $acc6, $in2_z+8*1(%rsp)
  1769. mov $acc7, $in2_z+8*2(%rsp)
  1770. mov $acc0, $in2_z+8*3(%rsp)
  1771. lea $Z2sqr(%rsp), $r_ptr # Z2^2
  1772. call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
  1773. pcmpeqd %xmm4, %xmm5
  1774. pshufd \$0xb1, %xmm3, %xmm4
  1775. por %xmm3, %xmm4
  1776. pshufd \$0, %xmm5, %xmm5 # in1infty
  1777. pshufd \$0x1e, %xmm4, %xmm3
  1778. por %xmm3, %xmm4
  1779. pxor %xmm3, %xmm3
  1780. pcmpeqd %xmm3, %xmm4
  1781. pshufd \$0, %xmm4, %xmm4 # in2infty
  1782. mov 0x40+8*0($b_ptr), $src0 # load original in1_z
  1783. mov 0x40+8*1($b_ptr), $acc6
  1784. mov 0x40+8*2($b_ptr), $acc7
  1785. mov 0x40+8*3($b_ptr), $acc0
  1786. movq $b_ptr, %xmm1
  1787. lea 0x40-$bias($b_ptr), $a_ptr
  1788. lea $Z1sqr(%rsp), $r_ptr # Z1^2
  1789. call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
  1790. `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
  1791. lea $S1(%rsp), $r_ptr # S1 = Z2^3
  1792. call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
  1793. `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
  1794. lea $S2(%rsp), $r_ptr # S2 = Z1^3
  1795. call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
  1796. `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
  1797. lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
  1798. call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
  1799. `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
  1800. lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
  1801. call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
  1802. lea $S1(%rsp), $b_ptr
  1803. lea $R(%rsp), $r_ptr # R = S2 - S1
  1804. call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
  1805. or $acc5, $acc4 # see if result is zero
  1806. movdqa %xmm4, %xmm2
  1807. or $acc0, $acc4
  1808. or $acc1, $acc4
  1809. por %xmm5, %xmm2 # in1infty || in2infty
  1810. movq $acc4, %xmm3
`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
    lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
    lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
    lea $U1(%rsp), $b_ptr
    lea $H(%rsp), $r_ptr # H = U2 - U1
    call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
    or $acc5, $acc4 # see if result is zero
    or $acc0, $acc4
    or $acc1, $acc4
    .byte 0x3e # predict taken
    jnz .Ladd_proceed$x # is_equal(U1,U2)?
    movq %xmm2, $acc0
    movq %xmm3, $acc1
    test $acc0, $acc0
    jnz .Ladd_proceed$x # (in1infty || in2infty)?
    test $acc1, $acc1
    jz .Ladd_double$x # is_equal(S1,S2)?
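# At this point H == 0 and neither input is the point at infinity, so the
# two inputs share the same affine x. If R != 0 their y coordinates are
# negatives of each other and the sum is the point at infinity, which the
# stores below encode as an all-zero output. If R == 0 the inputs are equal
# and the code falls through to the doubling shortcut instead.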
    movq %xmm0, $r_ptr # restore $r_ptr
    pxor %xmm0, %xmm0
    movdqu %xmm0, 0x00($r_ptr)
    movdqu %xmm0, 0x10($r_ptr)
    movdqu %xmm0, 0x20($r_ptr)
    movdqu %xmm0, 0x30($r_ptr)
    movdqu %xmm0, 0x40($r_ptr)
    movdqu %xmm0, 0x50($r_ptr)
    jmp .Ladd_done$x

.align 32
.Ladd_double$x:
    movq %xmm1, $a_ptr # restore $a_ptr
    movq %xmm0, $r_ptr # restore $r_ptr
    add \$`32*(18-5)`, %rsp # difference in frame sizes
    jmp .Lpoint_double_shortcut$x

.align 32
.Ladd_proceed$x:
`&load_for_sqr("$R(%rsp)", "$src0")`
    lea $Rsqr(%rsp), $r_ptr # R^2
    call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
    lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
`&load_for_sqr("$H(%rsp)", "$src0")`
    lea $Hsqr(%rsp), $r_ptr # H^2
    call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
    lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
    lea $Hcub(%rsp), $r_ptr # H^3
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
    lea $U2(%rsp), $r_ptr # U1*H^2
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
___
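# What follows evaluates the remaining point-addition formulas
#   res_x = R^2 - H^3 - 2*U1*H^2
#   res_y = R*(U1*H^2 - res_x) - S1*H^3
#   res_z = H*Z1*Z2   (already computed above)
# The doubling of U1*H^2 is inlined below instead of calling
# __ecp_nistz256_mul_by_2: add-with-carry, then a conditional subtraction
# of the prime when the addition carries out of 256 bits.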
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
    #lea $U2(%rsp), $a_ptr
    #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
    #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
    add $acc0, $acc0 # a0:a3+a0:a3
    lea $Rsqr(%rsp), $a_ptr
    adc $acc1, $acc1
    mov $acc0, $t0
    adc $acc2, $acc2
    adc $acc3, $acc3
    mov $acc1, $t1
    sbb $t4, $t4
    sub \$-1, $acc0
    mov $acc2, $t2
    sbb $poly1, $acc1
    sbb \$0, $acc2
    mov $acc3, $t3
    sbb $poly3, $acc3
    test $t4, $t4
    cmovz $t0, $acc0
    mov 8*0($a_ptr), $t0
    cmovz $t1, $acc1
    mov 8*1($a_ptr), $t1
    cmovz $t2, $acc2
    mov 8*2($a_ptr), $t2
    cmovz $t3, $acc3
    mov 8*3($a_ptr), $t3
    call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
    lea $Hcub(%rsp), $b_ptr
    lea $res_x(%rsp), $r_ptr
    call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
    mov $U2+8*0(%rsp), $t0
    mov $U2+8*1(%rsp), $t1
    mov $U2+8*2(%rsp), $t2
    mov $U2+8*3(%rsp), $t3
    lea $res_y(%rsp), $r_ptr
    call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
    mov $acc0, 8*0($r_ptr) # save the result, as
    mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't store it
    mov $acc2, 8*2($r_ptr)
    mov $acc3, 8*3($r_ptr)
___
}
$code.=<<___;
`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
    lea $S2(%rsp), $r_ptr
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
    lea $res_y(%rsp), $r_ptr
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
    lea $S2(%rsp), $b_ptr
    lea $res_y(%rsp), $r_ptr
    call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
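# The computed (res_x, res_y, res_z) is only correct when neither input was
# the point at infinity. The pandn/pand/por sequences below perform a
# branch-free copy_conditional: in1infty selects in2 over the result,
# in2infty selects in1, otherwise the computed sum is stored unchanged.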
    movq %xmm0, $r_ptr # restore $r_ptr
    movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
    movdqa %xmm5, %xmm1
    pandn $res_z(%rsp), %xmm0
    movdqa %xmm5, %xmm2
    pandn $res_z+0x10(%rsp), %xmm1
    movdqa %xmm5, %xmm3
    pand $in2_z(%rsp), %xmm2
    pand $in2_z+0x10(%rsp), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
    movdqa %xmm4, %xmm1
    pandn %xmm2, %xmm0
    movdqa %xmm4, %xmm2
    pandn %xmm3, %xmm1
    movdqa %xmm4, %xmm3
    pand $in1_z(%rsp), %xmm2
    pand $in1_z+0x10(%rsp), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqu %xmm2, 0x40($r_ptr)
    movdqu %xmm3, 0x50($r_ptr)
    movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
    movdqa %xmm5, %xmm1
    pandn $res_x(%rsp), %xmm0
    movdqa %xmm5, %xmm2
    pandn $res_x+0x10(%rsp), %xmm1
    movdqa %xmm5, %xmm3
    pand $in2_x(%rsp), %xmm2
    pand $in2_x+0x10(%rsp), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
    movdqa %xmm4, %xmm1
    pandn %xmm2, %xmm0
    movdqa %xmm4, %xmm2
    pandn %xmm3, %xmm1
    movdqa %xmm4, %xmm3
    pand $in1_x(%rsp), %xmm2
    pand $in1_x+0x10(%rsp), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqu %xmm2, 0x00($r_ptr)
    movdqu %xmm3, 0x10($r_ptr)
    movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
    movdqa %xmm5, %xmm1
    pandn $res_y(%rsp), %xmm0
    movdqa %xmm5, %xmm2
    pandn $res_y+0x10(%rsp), %xmm1
    movdqa %xmm5, %xmm3
    pand $in2_y(%rsp), %xmm2
    pand $in2_y+0x10(%rsp), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
    movdqa %xmm4, %xmm1
    pandn %xmm2, %xmm0
    movdqa %xmm4, %xmm2
    pandn %xmm3, %xmm1
    movdqa %xmm4, %xmm3
    pand $in1_y(%rsp), %xmm2
    pand $in1_y+0x10(%rsp), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqu %xmm2, 0x20($r_ptr)
    movdqu %xmm3, 0x30($r_ptr)

.Ladd_done$x:
    add \$32*18+8, %rsp
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbx
    pop %rbp
    ret
.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
___
}
&gen_add("q");
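################################################################################
# ecp_nistz256_point_add_affine: mixed addition, i.e. the second operand is
# a P256_POINT_AFFINE with an implicit Z2 = 1 (in Montgomery form). The
# Z2^2/Z2^3 multiplications of the generic routine therefore disappear and
# the frame only needs fifteen 32-byte temporaries.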
sub gen_add_affine () {
    my $x = shift;
    my ($src0,$sfx,$bias);
    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
        $res_x,$res_y,$res_z,
        $in1_x,$in1_y,$in1_z,
        $in2_x,$in2_y)=map(32*$_,(0..14));
    my $Z1sqr = $S2;

    if ($x ne "x") {
        $src0 = "%rax";
        $sfx = "";
        $bias = 0;

$code.=<<___;
.globl ecp_nistz256_point_add_affine
.type ecp_nistz256_point_add_affine,\@function,3
.align 32
ecp_nistz256_point_add_affine:
___
$code.=<<___ if ($addx);
    mov \$0x80100, %ecx
    and OPENSSL_ia32cap_P+8(%rip), %ecx
    cmp \$0x80100, %ecx
    je .Lpoint_add_affinex
___
    } else {
        $src0 = "%rdx";
        $sfx = "x";
        $bias = 128;

$code.=<<___;
.type ecp_nistz256_point_add_affinex,\@function,3
.align 32
ecp_nistz256_point_add_affinex:
.Lpoint_add_affinex:
___
    }
$code.=<<___;
    push %rbp
    push %rbx
    push %r12
    push %r13
    push %r14
    push %r15
    sub \$32*15+8, %rsp

    movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
    mov $b_org, $b_ptr # reassign
    movdqu 0x10($a_ptr), %xmm1
    movdqu 0x20($a_ptr), %xmm2
    movdqu 0x30($a_ptr), %xmm3
    movdqu 0x40($a_ptr), %xmm4
    movdqu 0x50($a_ptr), %xmm5
    mov 0x40+8*0($a_ptr), $src0 # load original in1_z
    mov 0x40+8*1($a_ptr), $acc6
    mov 0x40+8*2($a_ptr), $acc7
    mov 0x40+8*3($a_ptr), $acc0
    movdqa %xmm0, $in1_x(%rsp)
    movdqa %xmm1, $in1_x+0x10(%rsp)
    por %xmm0, %xmm1
    movdqa %xmm2, $in1_y(%rsp)
    movdqa %xmm3, $in1_y+0x10(%rsp)
    por %xmm2, %xmm3
    movdqa %xmm4, $in1_z(%rsp)
    movdqa %xmm5, $in1_z+0x10(%rsp)
    por %xmm1, %xmm3
    movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
    pshufd \$0xb1, %xmm3, %xmm5
    movdqu 0x10($b_ptr), %xmm1
    movdqu 0x20($b_ptr), %xmm2
    por %xmm3, %xmm5
    movdqu 0x30($b_ptr), %xmm3
    movdqa %xmm0, $in2_x(%rsp)
    pshufd \$0x1e, %xmm5, %xmm4
    movdqa %xmm1, $in2_x+0x10(%rsp)
    por %xmm0, %xmm1
    movq $r_ptr, %xmm0 # save $r_ptr
    movdqa %xmm2, $in2_y(%rsp)
    movdqa %xmm3, $in2_y+0x10(%rsp)
    por %xmm2, %xmm3
    por %xmm4, %xmm5
    pxor %xmm4, %xmm4
    por %xmm1, %xmm3
    lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
    lea $Z1sqr(%rsp), $r_ptr # Z1^2
    call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
    pcmpeqd %xmm4, %xmm5
    pshufd \$0xb1, %xmm3, %xmm4
    mov 0x00($b_ptr), $src0 # $b_ptr is still valid
    #lea 0x00($b_ptr), $b_ptr
    mov $acc4, $acc1 # harmonize sqr output and mul input
    por %xmm3, %xmm4
    pshufd \$0, %xmm5, %xmm5 # in1infty
    pshufd \$0x1e, %xmm4, %xmm3
    mov $acc5, $acc2
    por %xmm3, %xmm4
    pxor %xmm3, %xmm3
    mov $acc6, $acc3
    pcmpeqd %xmm3, %xmm4
    pshufd \$0, %xmm4, %xmm4 # in2infty
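# As in the generic routine, %xmm5 (in1infty) and %xmm4 (in2infty) are
# all-ones masks flagging inputs whose X and Y limbs are all zero; here the
# second input is affine, so only its X2 and Y2 words were accumulated.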
    lea $Z1sqr-$bias(%rsp), $a_ptr
    mov $acc7, $acc4
    lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
    lea $in1_x(%rsp), $b_ptr
    lea $H(%rsp), $r_ptr # H = U2 - U1
    call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
    lea $S2(%rsp), $r_ptr # S2 = Z1^3
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
    lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
    lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
    lea $in1_y(%rsp), $b_ptr
    lea $R(%rsp), $r_ptr # R = S2 - S1
    call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
`&load_for_sqr("$H(%rsp)", "$src0")`
    lea $Hsqr(%rsp), $r_ptr # H^2
    call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
`&load_for_sqr("$R(%rsp)", "$src0")`
    lea $Rsqr(%rsp), $r_ptr # R^2
    call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
    lea $Hcub(%rsp), $r_ptr # H^3
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
    lea $U2(%rsp), $r_ptr # U1*H^2
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
___
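# Same tail as in the generic addition, with U1 = in1_x and S1 = in1_y
# because Z2 = 1:
#   res_x = R^2 - H^3 - 2*in1_x*H^2
#   res_y = R*(in1_x*H^2 - res_x) - in1_y*H^3
# The doubling of in1_x*H^2 is again inlined, in the 4-5-0-1 name space.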
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
    #lea $U2(%rsp), $a_ptr
    #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
    #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
    add $acc0, $acc0 # a0:a3+a0:a3
    lea $Rsqr(%rsp), $a_ptr
    adc $acc1, $acc1
    mov $acc0, $t0
    adc $acc2, $acc2
    adc $acc3, $acc3
    mov $acc1, $t1
    sbb $t4, $t4
    sub \$-1, $acc0
    mov $acc2, $t2
    sbb $poly1, $acc1
    sbb \$0, $acc2
    mov $acc3, $t3
    sbb $poly3, $acc3
    test $t4, $t4
    cmovz $t0, $acc0
    mov 8*0($a_ptr), $t0
    cmovz $t1, $acc1
    mov 8*1($a_ptr), $t1
    cmovz $t2, $acc2
    mov 8*2($a_ptr), $t2
    cmovz $t3, $acc3
    mov 8*3($a_ptr), $t3
    call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
    lea $Hcub(%rsp), $b_ptr
    lea $res_x(%rsp), $r_ptr
    call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
    mov $U2+8*0(%rsp), $t0
    mov $U2+8*1(%rsp), $t1
    mov $U2+8*2(%rsp), $t2
    mov $U2+8*3(%rsp), $t3
    lea $H(%rsp), $r_ptr
    call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
    mov $acc0, 8*0($r_ptr) # save the result, as
    mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't store it
    mov $acc2, 8*2($r_ptr)
    mov $acc3, 8*3($r_ptr)
___
}
$code.=<<___;
`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
    lea $S2(%rsp), $r_ptr
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
    lea $H(%rsp), $r_ptr
    call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
    lea $S2(%rsp), $b_ptr
    lea $res_y(%rsp), $r_ptr
    call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
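# Branch-free selection of the final output, as in the generic routine. The
# only difference is that when the first input is the point at infinity the
# Z coordinate returned for the affine second input is the constant 1 in
# Montgomery form (.LONE_mont), since an affine point carries no Z.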
    movq %xmm0, $r_ptr # restore $r_ptr
    movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
    movdqa %xmm5, %xmm1
    pandn $res_z(%rsp), %xmm0
    movdqa %xmm5, %xmm2
    pandn $res_z+0x10(%rsp), %xmm1
    movdqa %xmm5, %xmm3
    pand .LONE_mont(%rip), %xmm2
    pand .LONE_mont+0x10(%rip), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
    movdqa %xmm4, %xmm1
    pandn %xmm2, %xmm0
    movdqa %xmm4, %xmm2
    pandn %xmm3, %xmm1
    movdqa %xmm4, %xmm3
    pand $in1_z(%rsp), %xmm2
    pand $in1_z+0x10(%rsp), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqu %xmm2, 0x40($r_ptr)
    movdqu %xmm3, 0x50($r_ptr)
    movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
    movdqa %xmm5, %xmm1
    pandn $res_x(%rsp), %xmm0
    movdqa %xmm5, %xmm2
    pandn $res_x+0x10(%rsp), %xmm1
    movdqa %xmm5, %xmm3
    pand $in2_x(%rsp), %xmm2
    pand $in2_x+0x10(%rsp), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
    movdqa %xmm4, %xmm1
    pandn %xmm2, %xmm0
    movdqa %xmm4, %xmm2
    pandn %xmm3, %xmm1
    movdqa %xmm4, %xmm3
    pand $in1_x(%rsp), %xmm2
    pand $in1_x+0x10(%rsp), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqu %xmm2, 0x00($r_ptr)
    movdqu %xmm3, 0x10($r_ptr)
    movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
    movdqa %xmm5, %xmm1
    pandn $res_y(%rsp), %xmm0
    movdqa %xmm5, %xmm2
    pandn $res_y+0x10(%rsp), %xmm1
    movdqa %xmm5, %xmm3
    pand $in2_y(%rsp), %xmm2
    pand $in2_y+0x10(%rsp), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
    movdqa %xmm4, %xmm1
    pandn %xmm2, %xmm0
    movdqa %xmm4, %xmm2
    pandn %xmm3, %xmm1
    movdqa %xmm4, %xmm3
    pand $in1_y(%rsp), %xmm2
    pand $in1_y+0x10(%rsp), %xmm3
    por %xmm0, %xmm2
    por %xmm1, %xmm3
    movdqu %xmm2, 0x20($r_ptr)
    movdqu %xmm3, 0x30($r_ptr)

    add \$32*15+8, %rsp
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbx
    pop %rbp
    ret
.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
___
}
&gen_add_affine("q");

########################################################################
# AD*X magic
#
if ($addx) { {
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
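# The subroutines below are the "x"-suffixed counterparts of the field
# helpers used above; they are only emitted (and dispatched to) when $addx
# is enabled, i.e. on CPUs advertising BMI2 and ADX. The leading xor clears
# both the scratch register and CF, so the first adc/sbb acts as a plain
# add/sub.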
$code.=<<___;
.type __ecp_nistz256_add_tox,\@abi-omnipotent
.align 32
__ecp_nistz256_add_tox:
    xor $t4, $t4
    adc 8*0($b_ptr), $a0
    adc 8*1($b_ptr), $a1
    mov $a0, $t0
    adc 8*2($b_ptr), $a2
    adc 8*3($b_ptr), $a3
    mov $a1, $t1
    adc \$0, $t4
    xor $t3, $t3
    sbb \$-1, $a0
    mov $a2, $t2
    sbb $poly1, $a1
    sbb \$0, $a2
    mov $a3, $t3
    sbb $poly3, $a3
    bt \$0, $t4
    cmovnc $t0, $a0
    cmovnc $t1, $a1
    mov $a0, 8*0($r_ptr)
    cmovnc $t2, $a2
    mov $a1, 8*1($r_ptr)
    cmovnc $t3, $a3
    mov $a2, 8*2($r_ptr)
    mov $a3, 8*3($r_ptr)
    ret
.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox

.type __ecp_nistz256_sub_fromx,\@abi-omnipotent
.align 32
__ecp_nistz256_sub_fromx:
    xor $t4, $t4
    sbb 8*0($b_ptr), $a0
    sbb 8*1($b_ptr), $a1
    mov $a0, $t0
    sbb 8*2($b_ptr), $a2
    sbb 8*3($b_ptr), $a3
    mov $a1, $t1
    sbb \$0, $t4
    xor $t3, $t3
    adc \$-1, $a0
    mov $a2, $t2
    adc $poly1, $a1
    adc \$0, $a2
    mov $a3, $t3
    adc $poly3, $a3
    bt \$0, $t4
    cmovnc $t0, $a0
    cmovnc $t1, $a1
    mov $a0, 8*0($r_ptr)
    cmovnc $t2, $a2
    mov $a1, 8*1($r_ptr)
    cmovnc $t3, $a3
    mov $a2, 8*2($r_ptr)
    mov $a3, 8*3($r_ptr)
    ret
.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx

.type __ecp_nistz256_subx,\@abi-omnipotent
.align 32
__ecp_nistz256_subx:
    xor $t4, $t4
    sbb $a0, $t0
    sbb $a1, $t1
    mov $t0, $a0
    sbb $a2, $t2
    sbb $a3, $t3
    mov $t1, $a1
    sbb \$0, $t4
    xor $a3, $a3
    adc \$-1, $t0
    mov $t2, $a2
    adc $poly1, $t1
    adc \$0, $t2
    mov $t3, $a3
    adc $poly3, $t3
    bt \$0, $t4
    cmovc $t0, $a0
    cmovc $t1, $a1
    cmovc $t2, $a2
    cmovc $t3, $a3
    ret
.size __ecp_nistz256_subx,.-__ecp_nistz256_subx

.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent
.align 32
__ecp_nistz256_mul_by_2x:
    xor $t4, $t4
    adc $a0, $a0 # a0:a3+a0:a3
    adc $a1, $a1
    mov $a0, $t0
    adc $a2, $a2
    adc $a3, $a3
    mov $a1, $t1
    adc \$0, $t4
    xor $t3, $t3
    sbb \$-1, $a0
    mov $a2, $t2
    sbb $poly1, $a1
    sbb \$0, $a2
    mov $a3, $t3
    sbb $poly3, $a3
    bt \$0, $t4
    cmovnc $t0, $a0
    cmovnc $t1, $a1
    mov $a0, 8*0($r_ptr)
    cmovnc $t2, $a2
    mov $a1, 8*1($r_ptr)
    cmovnc $t3, $a3
    mov $a2, 8*2($r_ptr)
    mov $a3, 8*3($r_ptr)
    ret
.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
___
}
&gen_double("x");
&gen_add("x");
&gen_add_affine("x");
}
}}}
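# Expand the `&load_for_mul(...)`-style snippets embedded verbatim in $code:
# every backquoted fragment is evaluated as Perl and replaced with its
# result before the generated code is printed.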
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;