#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more than
# twice (>2x) as fast. The most common case, rsa1024 sign, is improved by
# a respectable 50%. It remains to be seen whether loop unrolling and a
# dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add a dedicated squaring procedure. The performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in the squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.
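
# A reference model of what the assembly below computes (an editor's sketch,
# not part of the generated code and never called): word-serial Montgomery
# multiplication, written with Math::BigInt so that the 64x64-bit partial
# products cannot overflow. The sub name and argument names are illustrative
# only; $n0v is assumed to be -n^-1 mod 2^64, as in bn_mul_mont below.
use Math::BigInt;

sub _mont_mul_reference {
    my ($av, $bv, $nv, $n0v, $nwords) = @_;     # Math::BigInt values, $nwords 64-bit limbs each
    my $word = Math::BigInt->new(1)->blsft(64); # 2^64
    my $t = Math::BigInt->bzero();
    for my $i (0 .. $nwords-1) {
        my $bi = $bv->copy->brsft(64*$i)->bmod($word);          # b[i]
        $t->badd($av->copy->bmul($bi));                         # t += a*b[i]
        my $m = $t->copy->bmod($word)->bmul($n0v)->bmod($word); # m = t[0]*n0 mod 2^64
        $t->badd($nv->copy->bmul($m))->brsft(64);               # t = (t + m*n)/2^64
    }
    $t->bsub($nv) if $t->bcmp($nv) >= 0;        # final conditional subtraction
    return $t;                                  # a*b*2^(-64*nwords) mod n
}
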
  41. $flavour = shift;
  42. $output = shift;
  43. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  44. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  45. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  46. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  47. ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  48. die "can't locate x86_64-xlate.pl";
  49. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  50. *STDOUT=*OUT;
  51. # In upstream, this is controlled by shelling out to the compiler to check
  52. # versions, but BoringSSL is intended to be used with pre-generated perlasm
  53. # output, so this isn't useful anyway.
  54. $addx = 1;
  55. # int bn_mul_mont(
  56. $rp="%rdi"; # BN_ULONG *rp,
  57. $ap="%rsi"; # const BN_ULONG *ap,
  58. $bp="%rdx"; # const BN_ULONG *bp,
  59. $np="%rcx"; # const BN_ULONG *np,
  60. $n0="%r8"; # const BN_ULONG *n0,
  61. # TODO(davidben): The code below treats $num as an int, but C passes in a
  62. # size_t.
  63. $num="%r9"; # size_t num);
  64. $lo0="%r10";
  65. $hi0="%r11";
  66. $hi1="%r13";
  67. $i="%r14";
  68. $j="%r15";
  69. $m0="%rbx";
  70. $m1="%rbp";
  71. $code=<<___;
  72. .text
  73. .extern OPENSSL_ia32cap_P
  74. .globl bn_mul_mont
  75. .type bn_mul_mont,\@function,6
  76. .align 16
  77. bn_mul_mont:
  78. .cfi_startproc
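# writing to the 32-bit ${num}d zero-extends num; only the low 32 bits of
# the size_t argument are used (see the TODO above)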
  79. mov ${num}d,${num}d
  80. mov %rsp,%rax
  81. .cfi_def_cfa_register %rax
  82. test \$3,${num}d
  83. jnz .Lmul_enter
  84. cmp \$8,${num}d
  85. jb .Lmul_enter
  86. ___
  87. $code.=<<___ if ($addx);
  88. leaq OPENSSL_ia32cap_P(%rip),%r11
  89. mov 8(%r11),%r11d
  90. ___
  91. $code.=<<___;
  92. cmp $ap,$bp
  93. jne .Lmul4x_enter
  94. test \$7,${num}d
  95. jz .Lsqr8x_enter
  96. jmp .Lmul4x_enter
  97. .align 16
  98. .Lmul_enter:
  99. push %rbx
  100. .cfi_push %rbx
  101. push %rbp
  102. .cfi_push %rbp
  103. push %r12
  104. .cfi_push %r12
  105. push %r13
  106. .cfi_push %r13
  107. push %r14
  108. .cfi_push %r14
  109. push %r15
  110. .cfi_push %r15
  111. neg $num
  112. mov %rsp,%r11
  113. lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2))
  114. neg $num # restore $num
  115. and \$-1024,%r10 # minimize TLB usage
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on the stack being "wired" to
# physical memory in a strictly sequential manner, i.e. if a stack
# allocation spans two pages, then a reference to the farthest one
# can be punished with a SEGV. But page walking does good even on
# other OSes, because it guarantees that a hostile thread hits the
# guard page before it can do damage to an innocent one...
  124. sub %r10,%r11
  125. and \$-4096,%r11
  126. lea (%r10,%r11),%rsp
  127. mov (%rsp),%r11
  128. cmp %r10,%rsp
  129. ja .Lmul_page_walk
  130. jmp .Lmul_page_walk_done
  131. .align 16
  132. .Lmul_page_walk:
  133. lea -4096(%rsp),%rsp
  134. mov (%rsp),%r11
  135. cmp %r10,%rsp
  136. ja .Lmul_page_walk
  137. .Lmul_page_walk_done:
  138. mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
  139. .cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
  140. .Lmul_body:
  141. mov $bp,%r12 # reassign $bp
  142. ___
  143. $bp="%r12";
  144. $code.=<<___;
  145. mov ($n0),$n0 # pull n0[0] value
  146. mov ($bp),$m0 # m0=bp[0]
  147. mov ($ap),%rax
  148. xor $i,$i # i=0
  149. xor $j,$j # j=0
  150. mov $n0,$m1
  151. mulq $m0 # ap[0]*bp[0]
  152. mov %rax,$lo0
  153. mov ($np),%rax
  154. imulq $lo0,$m1 # "tp[0]"*n0
  155. mov %rdx,$hi0
  156. mulq $m1 # np[0]*m1
  157. add %rax,$lo0 # discarded
  158. mov 8($ap),%rax
  159. adc \$0,%rdx
  160. mov %rdx,$hi1
  161. lea 1($j),$j # j++
  162. jmp .L1st_enter
  163. .align 16
  164. .L1st:
  165. add %rax,$hi1
  166. mov ($ap,$j,8),%rax
  167. adc \$0,%rdx
  168. add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
  169. mov $lo0,$hi0
  170. adc \$0,%rdx
  171. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  172. mov %rdx,$hi1
  173. .L1st_enter:
  174. mulq $m0 # ap[j]*bp[0]
  175. add %rax,$hi0
  176. mov ($np,$j,8),%rax
  177. adc \$0,%rdx
  178. lea 1($j),$j # j++
  179. mov %rdx,$lo0
  180. mulq $m1 # np[j]*m1
  181. cmp $num,$j
  182. jne .L1st
  183. add %rax,$hi1
  184. mov ($ap),%rax # ap[0]
  185. adc \$0,%rdx
  186. add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
  187. adc \$0,%rdx
  188. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  189. mov %rdx,$hi1
  190. mov $lo0,$hi0
  191. xor %rdx,%rdx
  192. add $hi0,$hi1
  193. adc \$0,%rdx
  194. mov $hi1,-8(%rsp,$num,8)
  195. mov %rdx,(%rsp,$num,8) # store upmost overflow bit
  196. lea 1($i),$i # i++
  197. jmp .Louter
  198. .align 16
  199. .Louter:
  200. mov ($bp,$i,8),$m0 # m0=bp[i]
  201. xor $j,$j # j=0
  202. mov $n0,$m1
  203. mov (%rsp),$lo0
  204. mulq $m0 # ap[0]*bp[i]
  205. add %rax,$lo0 # ap[0]*bp[i]+tp[0]
  206. mov ($np),%rax
  207. adc \$0,%rdx
  208. imulq $lo0,$m1 # tp[0]*n0
  209. mov %rdx,$hi0
  210. mulq $m1 # np[0]*m1
  211. add %rax,$lo0 # discarded
  212. mov 8($ap),%rax
  213. adc \$0,%rdx
  214. mov 8(%rsp),$lo0 # tp[1]
  215. mov %rdx,$hi1
  216. lea 1($j),$j # j++
  217. jmp .Linner_enter
  218. .align 16
  219. .Linner:
  220. add %rax,$hi1
  221. mov ($ap,$j,8),%rax
  222. adc \$0,%rdx
  223. add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
  224. mov (%rsp,$j,8),$lo0
  225. adc \$0,%rdx
  226. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  227. mov %rdx,$hi1
  228. .Linner_enter:
  229. mulq $m0 # ap[j]*bp[i]
  230. add %rax,$hi0
  231. mov ($np,$j,8),%rax
  232. adc \$0,%rdx
  233. add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
  234. mov %rdx,$hi0
  235. adc \$0,$hi0
  236. lea 1($j),$j # j++
  237. mulq $m1 # np[j]*m1
  238. cmp $num,$j
  239. jne .Linner
  240. add %rax,$hi1
  241. mov ($ap),%rax # ap[0]
  242. adc \$0,%rdx
  243. add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
  244. mov (%rsp,$j,8),$lo0
  245. adc \$0,%rdx
  246. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  247. mov %rdx,$hi1
  248. xor %rdx,%rdx
  249. add $hi0,$hi1
  250. adc \$0,%rdx
  251. add $lo0,$hi1 # pull upmost overflow bit
  252. adc \$0,%rdx
  253. mov $hi1,-8(%rsp,$num,8)
  254. mov %rdx,(%rsp,$num,8) # store upmost overflow bit
  255. lea 1($i),$i # i++
  256. cmp $num,$i
  257. jb .Louter
  258. xor $i,$i # i=0 and clear CF!
  259. mov (%rsp),%rax # tp[0]
  260. mov $num,$j # j=num
  261. .align 16
  262. .Lsub: sbb ($np,$i,8),%rax
  263. mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
  264. mov 8(%rsp,$i,8),%rax # tp[i+1]
  265. lea 1($i),$i # i++
  266. dec $j # doesn't affect CF!
  267. jnz .Lsub
  268. sbb \$0,%rax # handle upmost overflow bit
  269. mov \$-1,%rbx
  270. xor %rax,%rbx # not %rax
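# %rax is now 0 if the subtracted result (already in rp) is to be kept, or
# all-ones if the subtraction borrowed and tp should be kept; the masked
# copy below selects between the two without a secret-dependent branch.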
  271. xor $i,$i
  272. mov $num,$j # j=num
  273. .Lcopy: # conditional copy
  274. mov ($rp,$i,8),%rcx
  275. mov (%rsp,$i,8),%rdx
  276. and %rbx,%rcx
  277. and %rax,%rdx
  278. mov $num,(%rsp,$i,8) # zap temporary vector
  279. or %rcx,%rdx
  280. mov %rdx,($rp,$i,8) # rp[i]=tp[i]
  281. lea 1($i),$i
  282. sub \$1,$j
  283. jnz .Lcopy
  284. mov 8(%rsp,$num,8),%rsi # restore %rsp
  285. .cfi_def_cfa %rsi,8
  286. mov \$1,%rax
  287. mov -48(%rsi),%r15
  288. .cfi_restore %r15
  289. mov -40(%rsi),%r14
  290. .cfi_restore %r14
  291. mov -32(%rsi),%r13
  292. .cfi_restore %r13
  293. mov -24(%rsi),%r12
  294. .cfi_restore %r12
  295. mov -16(%rsi),%rbp
  296. .cfi_restore %rbp
  297. mov -8(%rsi),%rbx
  298. .cfi_restore %rbx
  299. lea (%rsi),%rsp
  300. .cfi_def_cfa_register %rsp
  301. .Lmul_epilogue:
  302. ret
  303. .cfi_endproc
  304. .size bn_mul_mont,.-bn_mul_mont
  305. ___
  306. {{{
  307. my @A=("%r10","%r11");
  308. my @N=("%r13","%rdi");
  309. $code.=<<___;
  310. .type bn_mul4x_mont,\@function,6
  311. .align 16
  312. bn_mul4x_mont:
  313. .cfi_startproc
  314. mov ${num}d,${num}d
  315. mov %rsp,%rax
  316. .cfi_def_cfa_register %rax
  317. .Lmul4x_enter:
  318. ___
  319. $code.=<<___ if ($addx);
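# %r11d was loaded in bn_mul_mont from the dword at offset 8 of
# OPENSSL_ia32cap_P, i.e. CPUID(7).EBX; bit 8 is BMI2 (MULX) and bit 19
# is ADX, so the 0x80100 mask takes the MULX/ADCX/ADOX path only when
# both are available.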
  320. and \$0x80100,%r11d
  321. cmp \$0x80100,%r11d
  322. je .Lmulx4x_enter
  323. ___
  324. $code.=<<___;
  325. push %rbx
  326. .cfi_push %rbx
  327. push %rbp
  328. .cfi_push %rbp
  329. push %r12
  330. .cfi_push %r12
  331. push %r13
  332. .cfi_push %r13
  333. push %r14
  334. .cfi_push %r14
  335. push %r15
  336. .cfi_push %r15
  337. neg $num
  338. mov %rsp,%r11
  339. lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4))
  340. neg $num # restore
  341. and \$-1024,%r10 # minimize TLB usage
  342. sub %r10,%r11
  343. and \$-4096,%r11
  344. lea (%r10,%r11),%rsp
  345. mov (%rsp),%r11
  346. cmp %r10,%rsp
  347. ja .Lmul4x_page_walk
  348. jmp .Lmul4x_page_walk_done
  349. .Lmul4x_page_walk:
  350. lea -4096(%rsp),%rsp
  351. mov (%rsp),%r11
  352. cmp %r10,%rsp
  353. ja .Lmul4x_page_walk
  354. .Lmul4x_page_walk_done:
  355. mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
  356. .cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
  357. .Lmul4x_body:
  358. mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
  359. mov %rdx,%r12 # reassign $bp
  360. ___
  361. $bp="%r12";
  362. $code.=<<___;
  363. mov ($n0),$n0 # pull n0[0] value
  364. mov ($bp),$m0 # m0=bp[0]
  365. mov ($ap),%rax
  366. xor $i,$i # i=0
  367. xor $j,$j # j=0
  368. mov $n0,$m1
  369. mulq $m0 # ap[0]*bp[0]
  370. mov %rax,$A[0]
  371. mov ($np),%rax
  372. imulq $A[0],$m1 # "tp[0]"*n0
  373. mov %rdx,$A[1]
  374. mulq $m1 # np[0]*m1
  375. add %rax,$A[0] # discarded
  376. mov 8($ap),%rax
  377. adc \$0,%rdx
  378. mov %rdx,$N[1]
  379. mulq $m0
  380. add %rax,$A[1]
  381. mov 8($np),%rax
  382. adc \$0,%rdx
  383. mov %rdx,$A[0]
  384. mulq $m1
  385. add %rax,$N[1]
  386. mov 16($ap),%rax
  387. adc \$0,%rdx
  388. add $A[1],$N[1]
  389. lea 4($j),$j # j++
  390. adc \$0,%rdx
  391. mov $N[1],(%rsp)
  392. mov %rdx,$N[0]
  393. jmp .L1st4x
  394. .align 16
  395. .L1st4x:
  396. mulq $m0 # ap[j]*bp[0]
  397. add %rax,$A[0]
  398. mov -16($np,$j,8),%rax
  399. adc \$0,%rdx
  400. mov %rdx,$A[1]
  401. mulq $m1 # np[j]*m1
  402. add %rax,$N[0]
  403. mov -8($ap,$j,8),%rax
  404. adc \$0,%rdx
  405. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  406. adc \$0,%rdx
  407. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  408. mov %rdx,$N[1]
  409. mulq $m0 # ap[j]*bp[0]
  410. add %rax,$A[1]
  411. mov -8($np,$j,8),%rax
  412. adc \$0,%rdx
  413. mov %rdx,$A[0]
  414. mulq $m1 # np[j]*m1
  415. add %rax,$N[1]
  416. mov ($ap,$j,8),%rax
  417. adc \$0,%rdx
  418. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  419. adc \$0,%rdx
  420. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  421. mov %rdx,$N[0]
  422. mulq $m0 # ap[j]*bp[0]
  423. add %rax,$A[0]
  424. mov ($np,$j,8),%rax
  425. adc \$0,%rdx
  426. mov %rdx,$A[1]
  427. mulq $m1 # np[j]*m1
  428. add %rax,$N[0]
  429. mov 8($ap,$j,8),%rax
  430. adc \$0,%rdx
  431. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  432. adc \$0,%rdx
  433. mov $N[0],-8(%rsp,$j,8) # tp[j-1]
  434. mov %rdx,$N[1]
  435. mulq $m0 # ap[j]*bp[0]
  436. add %rax,$A[1]
  437. mov 8($np,$j,8),%rax
  438. adc \$0,%rdx
  439. lea 4($j),$j # j++
  440. mov %rdx,$A[0]
  441. mulq $m1 # np[j]*m1
  442. add %rax,$N[1]
  443. mov -16($ap,$j,8),%rax
  444. adc \$0,%rdx
  445. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  446. adc \$0,%rdx
  447. mov $N[1],-32(%rsp,$j,8) # tp[j-1]
  448. mov %rdx,$N[0]
  449. cmp $num,$j
  450. jb .L1st4x
  451. mulq $m0 # ap[j]*bp[0]
  452. add %rax,$A[0]
  453. mov -16($np,$j,8),%rax
  454. adc \$0,%rdx
  455. mov %rdx,$A[1]
  456. mulq $m1 # np[j]*m1
  457. add %rax,$N[0]
  458. mov -8($ap,$j,8),%rax
  459. adc \$0,%rdx
  460. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  461. adc \$0,%rdx
  462. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  463. mov %rdx,$N[1]
  464. mulq $m0 # ap[j]*bp[0]
  465. add %rax,$A[1]
  466. mov -8($np,$j,8),%rax
  467. adc \$0,%rdx
  468. mov %rdx,$A[0]
  469. mulq $m1 # np[j]*m1
  470. add %rax,$N[1]
  471. mov ($ap),%rax # ap[0]
  472. adc \$0,%rdx
  473. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  474. adc \$0,%rdx
  475. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  476. mov %rdx,$N[0]
  477. xor $N[1],$N[1]
  478. add $A[0],$N[0]
  479. adc \$0,$N[1]
  480. mov $N[0],-8(%rsp,$j,8)
  481. mov $N[1],(%rsp,$j,8) # store upmost overflow bit
  482. lea 1($i),$i # i++
  483. .align 4
  484. .Louter4x:
  485. mov ($bp,$i,8),$m0 # m0=bp[i]
  486. xor $j,$j # j=0
  487. mov (%rsp),$A[0]
  488. mov $n0,$m1
  489. mulq $m0 # ap[0]*bp[i]
  490. add %rax,$A[0] # ap[0]*bp[i]+tp[0]
  491. mov ($np),%rax
  492. adc \$0,%rdx
  493. imulq $A[0],$m1 # tp[0]*n0
  494. mov %rdx,$A[1]
  495. mulq $m1 # np[0]*m1
  496. add %rax,$A[0] # "$N[0]", discarded
  497. mov 8($ap),%rax
  498. adc \$0,%rdx
  499. mov %rdx,$N[1]
  500. mulq $m0 # ap[j]*bp[i]
  501. add %rax,$A[1]
  502. mov 8($np),%rax
  503. adc \$0,%rdx
  504. add 8(%rsp),$A[1] # +tp[1]
  505. adc \$0,%rdx
  506. mov %rdx,$A[0]
  507. mulq $m1 # np[j]*m1
  508. add %rax,$N[1]
  509. mov 16($ap),%rax
  510. adc \$0,%rdx
  511. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
  512. lea 4($j),$j # j+=2
  513. adc \$0,%rdx
  514. mov $N[1],(%rsp) # tp[j-1]
  515. mov %rdx,$N[0]
  516. jmp .Linner4x
  517. .align 16
  518. .Linner4x:
  519. mulq $m0 # ap[j]*bp[i]
  520. add %rax,$A[0]
  521. mov -16($np,$j,8),%rax
  522. adc \$0,%rdx
  523. add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  524. adc \$0,%rdx
  525. mov %rdx,$A[1]
  526. mulq $m1 # np[j]*m1
  527. add %rax,$N[0]
  528. mov -8($ap,$j,8),%rax
  529. adc \$0,%rdx
  530. add $A[0],$N[0]
  531. adc \$0,%rdx
  532. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  533. mov %rdx,$N[1]
  534. mulq $m0 # ap[j]*bp[i]
  535. add %rax,$A[1]
  536. mov -8($np,$j,8),%rax
  537. adc \$0,%rdx
  538. add -8(%rsp,$j,8),$A[1]
  539. adc \$0,%rdx
  540. mov %rdx,$A[0]
  541. mulq $m1 # np[j]*m1
  542. add %rax,$N[1]
  543. mov ($ap,$j,8),%rax
  544. adc \$0,%rdx
  545. add $A[1],$N[1]
  546. adc \$0,%rdx
  547. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  548. mov %rdx,$N[0]
  549. mulq $m0 # ap[j]*bp[i]
  550. add %rax,$A[0]
  551. mov ($np,$j,8),%rax
  552. adc \$0,%rdx
  553. add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  554. adc \$0,%rdx
  555. mov %rdx,$A[1]
  556. mulq $m1 # np[j]*m1
  557. add %rax,$N[0]
  558. mov 8($ap,$j,8),%rax
  559. adc \$0,%rdx
  560. add $A[0],$N[0]
  561. adc \$0,%rdx
  562. mov $N[0],-8(%rsp,$j,8) # tp[j-1]
  563. mov %rdx,$N[1]
  564. mulq $m0 # ap[j]*bp[i]
  565. add %rax,$A[1]
  566. mov 8($np,$j,8),%rax
  567. adc \$0,%rdx
  568. add 8(%rsp,$j,8),$A[1]
  569. adc \$0,%rdx
  570. lea 4($j),$j # j++
  571. mov %rdx,$A[0]
  572. mulq $m1 # np[j]*m1
  573. add %rax,$N[1]
  574. mov -16($ap,$j,8),%rax
  575. adc \$0,%rdx
  576. add $A[1],$N[1]
  577. adc \$0,%rdx
  578. mov $N[1],-32(%rsp,$j,8) # tp[j-1]
  579. mov %rdx,$N[0]
  580. cmp $num,$j
  581. jb .Linner4x
  582. mulq $m0 # ap[j]*bp[i]
  583. add %rax,$A[0]
  584. mov -16($np,$j,8),%rax
  585. adc \$0,%rdx
  586. add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  587. adc \$0,%rdx
  588. mov %rdx,$A[1]
  589. mulq $m1 # np[j]*m1
  590. add %rax,$N[0]
  591. mov -8($ap,$j,8),%rax
  592. adc \$0,%rdx
  593. add $A[0],$N[0]
  594. adc \$0,%rdx
  595. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  596. mov %rdx,$N[1]
  597. mulq $m0 # ap[j]*bp[i]
  598. add %rax,$A[1]
  599. mov -8($np,$j,8),%rax
  600. adc \$0,%rdx
  601. add -8(%rsp,$j,8),$A[1]
  602. adc \$0,%rdx
  603. lea 1($i),$i # i++
  604. mov %rdx,$A[0]
  605. mulq $m1 # np[j]*m1
  606. add %rax,$N[1]
  607. mov ($ap),%rax # ap[0]
  608. adc \$0,%rdx
  609. add $A[1],$N[1]
  610. adc \$0,%rdx
  611. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  612. mov %rdx,$N[0]
  613. xor $N[1],$N[1]
  614. add $A[0],$N[0]
  615. adc \$0,$N[1]
  616. add (%rsp,$num,8),$N[0] # pull upmost overflow bit
  617. adc \$0,$N[1]
  618. mov $N[0],-8(%rsp,$j,8)
  619. mov $N[1],(%rsp,$j,8) # store upmost overflow bit
  620. cmp $num,$i
  621. jb .Louter4x
  622. ___
  623. {
  624. my @ri=("%rax","%rdx",$m0,$m1);
  625. $code.=<<___;
  626. mov 16(%rsp,$num,8),$rp # restore $rp
  627. lea -4($num),$j
  628. mov 0(%rsp),@ri[0] # tp[0]
  629. mov 8(%rsp),@ri[1] # tp[1]
  630. shr \$2,$j # j=num/4-1
  631. lea (%rsp),$ap # borrow ap for tp
  632. xor $i,$i # i=0 and clear CF!
  633. sub 0($np),@ri[0]
  634. mov 16($ap),@ri[2] # tp[2]
  635. mov 24($ap),@ri[3] # tp[3]
  636. sbb 8($np),@ri[1]
  637. .Lsub4x:
  638. mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
  639. mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
  640. sbb 16($np,$i,8),@ri[2]
  641. mov 32($ap,$i,8),@ri[0] # tp[i+1]
  642. mov 40($ap,$i,8),@ri[1]
  643. sbb 24($np,$i,8),@ri[3]
  644. mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
  645. mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
  646. sbb 32($np,$i,8),@ri[0]
  647. mov 48($ap,$i,8),@ri[2]
  648. mov 56($ap,$i,8),@ri[3]
  649. sbb 40($np,$i,8),@ri[1]
  650. lea 4($i),$i # i++
  651. dec $j # doesn't affect CF!
  652. jnz .Lsub4x
  653. mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
  654. mov 32($ap,$i,8),@ri[0] # load overflow bit
  655. sbb 16($np,$i,8),@ri[2]
  656. mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
  657. sbb 24($np,$i,8),@ri[3]
  658. mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
  659. sbb \$0,@ri[0] # handle upmost overflow bit
  660. mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
  661. pxor %xmm0,%xmm0
  662. movq @ri[0],%xmm4
  663. pcmpeqd %xmm5,%xmm5
  664. pshufd \$0,%xmm4,%xmm4
  665. mov $num,$j
  666. pxor %xmm4,%xmm5
  667. shr \$2,$j # j=num/4
  668. xor %eax,%eax # i=0
  669. jmp .Lcopy4x
  670. .align 16
  671. .Lcopy4x: # conditional copy
  672. movdqa (%rsp,%rax),%xmm1
  673. movdqu ($rp,%rax),%xmm2
  674. pand %xmm4,%xmm1
  675. pand %xmm5,%xmm2
  676. movdqa 16(%rsp,%rax),%xmm3
  677. movdqa %xmm0,(%rsp,%rax)
  678. por %xmm2,%xmm1
  679. movdqu 16($rp,%rax),%xmm2
  680. movdqu %xmm1,($rp,%rax)
  681. pand %xmm4,%xmm3
  682. pand %xmm5,%xmm2
  683. movdqa %xmm0,16(%rsp,%rax)
  684. por %xmm2,%xmm3
  685. movdqu %xmm3,16($rp,%rax)
  686. lea 32(%rax),%rax
  687. dec $j
  688. jnz .Lcopy4x
  689. ___
  690. }
  691. $code.=<<___;
  692. mov 8(%rsp,$num,8),%rsi # restore %rsp
  693. .cfi_def_cfa %rsi, 8
  694. mov \$1,%rax
  695. mov -48(%rsi),%r15
  696. .cfi_restore %r15
  697. mov -40(%rsi),%r14
  698. .cfi_restore %r14
  699. mov -32(%rsi),%r13
  700. .cfi_restore %r13
  701. mov -24(%rsi),%r12
  702. .cfi_restore %r12
  703. mov -16(%rsi),%rbp
  704. .cfi_restore %rbp
  705. mov -8(%rsi),%rbx
  706. .cfi_restore %rbx
  707. lea (%rsi),%rsp
  708. .cfi_def_cfa_register %rsp
  709. .Lmul4x_epilogue:
  710. ret
  711. .cfi_endproc
  712. .size bn_mul4x_mont,.-bn_mul4x_mont
  713. ___
  714. }}}
  715. {{{
  716. ######################################################################
  717. # void bn_sqr8x_mont(
  718. my $rptr="%rdi"; # const BN_ULONG *rptr,
  719. my $aptr="%rsi"; # const BN_ULONG *aptr,
  720. my $bptr="%rdx"; # not used
  721. my $nptr="%rcx"; # const BN_ULONG *nptr,
  722. my $n0 ="%r8"; # const BN_ULONG *n0);
  723. my $num ="%r9"; # int num, has to be divisible by 8
  724. my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
  725. my @A0=("%r10","%r11");
  726. my @A1=("%r12","%r13");
  727. my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
  728. $code.=<<___ if ($addx);
  729. .extern bn_sqrx8x_internal # see x86_64-mont5 module
  730. ___
  731. $code.=<<___;
  732. .extern bn_sqr8x_internal # see x86_64-mont5 module
  733. .type bn_sqr8x_mont,\@function,6
  734. .align 32
  735. bn_sqr8x_mont:
  736. .cfi_startproc
  737. mov %rsp,%rax
  738. .cfi_def_cfa_register %rax
  739. .Lsqr8x_enter:
  740. push %rbx
  741. .cfi_push %rbx
  742. push %rbp
  743. .cfi_push %rbp
  744. push %r12
  745. .cfi_push %r12
  746. push %r13
  747. .cfi_push %r13
  748. push %r14
  749. .cfi_push %r14
  750. push %r15
  751. .cfi_push %r15
  752. .Lsqr8x_prologue:
  753. mov ${num}d,%r10d
  754. shl \$3,${num}d # convert $num to bytes
  755. shl \$3+2,%r10 # 4*$num
  756. neg $num
##############################################################
# Ensure that the stack frame doesn't alias with $aptr modulo
# 4096. This is done to allow the memory disambiguation logic
# to do its job.
#
  762. lea -64(%rsp,$num,2),%r11
  763. mov %rsp,%rbp
  764. mov ($n0),$n0 # *n0
  765. sub $aptr,%r11
  766. and \$4095,%r11
  767. cmp %r11,%r10
  768. jb .Lsqr8x_sp_alt
  769. sub %r11,%rbp # align with $aptr
  770. lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
  771. jmp .Lsqr8x_sp_done
  772. .align 32
  773. .Lsqr8x_sp_alt:
  774. lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
  775. lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
  776. sub %r10,%r11
  777. mov \$0,%r10
  778. cmovc %r10,%r11
  779. sub %r11,%rbp
  780. .Lsqr8x_sp_done:
  781. and \$-64,%rbp
  782. mov %rsp,%r11
  783. sub %rbp,%r11
  784. and \$-4096,%r11
  785. lea (%rbp,%r11),%rsp
  786. mov (%rsp),%r10
  787. cmp %rbp,%rsp
  788. ja .Lsqr8x_page_walk
  789. jmp .Lsqr8x_page_walk_done
  790. .align 16
  791. .Lsqr8x_page_walk:
  792. lea -4096(%rsp),%rsp
  793. mov (%rsp),%r10
  794. cmp %rbp,%rsp
  795. ja .Lsqr8x_page_walk
  796. .Lsqr8x_page_walk_done:
  797. mov $num,%r10
  798. neg $num
  799. mov $n0, 32(%rsp)
  800. mov %rax, 40(%rsp) # save original %rsp
  801. .cfi_cfa_expression %rsp+40,deref,+8
  802. .Lsqr8x_body:
  803. movq $nptr, %xmm2 # save pointer to modulus
  804. pxor %xmm0,%xmm0
  805. movq $rptr,%xmm1 # save $rptr
  806. movq %r10, %xmm3 # -$num
  807. ___
  808. $code.=<<___ if ($addx);
  809. leaq OPENSSL_ia32cap_P(%rip),%rax
  810. mov 8(%rax),%eax
  811. and \$0x80100,%eax
  812. cmp \$0x80100,%eax
  813. jne .Lsqr8x_nox
  814. call bn_sqrx8x_internal # see x86_64-mont5 module
  815. # %rax top-most carry
  816. # %rbp nptr
  817. # %rcx -8*num
  818. # %r8 end of tp[2*num]
  819. lea (%r8,%rcx),%rbx
  820. mov %rcx,$num
  821. mov %rcx,%rdx
  822. movq %xmm1,$rptr
  823. sar \$3+2,%rcx # %cf=0
  824. jmp .Lsqr8x_sub
  825. .align 32
  826. .Lsqr8x_nox:
  827. ___
  828. $code.=<<___;
  829. call bn_sqr8x_internal # see x86_64-mont5 module
  830. # %rax top-most carry
  831. # %rbp nptr
  832. # %r8 -8*num
  833. # %rdi end of tp[2*num]
  834. lea (%rdi,$num),%rbx
  835. mov $num,%rcx
  836. mov $num,%rdx
  837. movq %xmm1,$rptr
  838. sar \$3+2,%rcx # %cf=0
  839. jmp .Lsqr8x_sub
  840. .align 32
  841. .Lsqr8x_sub:
  842. mov 8*0(%rbx),%r12
  843. mov 8*1(%rbx),%r13
  844. mov 8*2(%rbx),%r14
  845. mov 8*3(%rbx),%r15
  846. lea 8*4(%rbx),%rbx
  847. sbb 8*0(%rbp),%r12
  848. sbb 8*1(%rbp),%r13
  849. sbb 8*2(%rbp),%r14
  850. sbb 8*3(%rbp),%r15
  851. lea 8*4(%rbp),%rbp
  852. mov %r12,8*0($rptr)
  853. mov %r13,8*1($rptr)
  854. mov %r14,8*2($rptr)
  855. mov %r15,8*3($rptr)
  856. lea 8*4($rptr),$rptr
  857. inc %rcx # preserves %cf
  858. jnz .Lsqr8x_sub
  859. sbb \$0,%rax # top-most carry
  860. lea (%rbx,$num),%rbx # rewind
  861. lea ($rptr,$num),$rptr # rewind
  862. movq %rax,%xmm1
  863. pxor %xmm0,%xmm0
  864. pshufd \$0,%xmm1,%xmm1
  865. mov 40(%rsp),%rsi # restore %rsp
  866. .cfi_def_cfa %rsi,8
  867. jmp .Lsqr8x_cond_copy
  868. .align 32
  869. .Lsqr8x_cond_copy:
  870. movdqa 16*0(%rbx),%xmm2
  871. movdqa 16*1(%rbx),%xmm3
  872. lea 16*2(%rbx),%rbx
  873. movdqu 16*0($rptr),%xmm4
  874. movdqu 16*1($rptr),%xmm5
  875. lea 16*2($rptr),$rptr
  876. movdqa %xmm0,-16*2(%rbx) # zero tp
  877. movdqa %xmm0,-16*1(%rbx)
  878. movdqa %xmm0,-16*2(%rbx,%rdx)
  879. movdqa %xmm0,-16*1(%rbx,%rdx)
  880. pcmpeqd %xmm1,%xmm0
  881. pand %xmm1,%xmm2
  882. pand %xmm1,%xmm3
  883. pand %xmm0,%xmm4
  884. pand %xmm0,%xmm5
  885. pxor %xmm0,%xmm0
  886. por %xmm2,%xmm4
  887. por %xmm3,%xmm5
  888. movdqu %xmm4,-16*2($rptr)
  889. movdqu %xmm5,-16*1($rptr)
  890. add \$32,$num
  891. jnz .Lsqr8x_cond_copy
  892. mov \$1,%rax
  893. mov -48(%rsi),%r15
  894. .cfi_restore %r15
  895. mov -40(%rsi),%r14
  896. .cfi_restore %r14
  897. mov -32(%rsi),%r13
  898. .cfi_restore %r13
  899. mov -24(%rsi),%r12
  900. .cfi_restore %r12
  901. mov -16(%rsi),%rbp
  902. .cfi_restore %rbp
  903. mov -8(%rsi),%rbx
  904. .cfi_restore %rbx
  905. lea (%rsi),%rsp
  906. .cfi_def_cfa_register %rsp
  907. .Lsqr8x_epilogue:
  908. ret
  909. .cfi_endproc
  910. .size bn_sqr8x_mont,.-bn_sqr8x_mont
  911. ___
  912. }}}
  913. if ($addx) {{{
  914. my $bp="%rdx"; # original value
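# The path below relies on MULX leaving the flags untouched while ADCX and
# ADOX propagate carries through CF and OF independently, so the a[]*b[i]
# accumulation and the n[]*m reduction can be interleaved in a single flow
# without saving and restoring flags.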
  915. $code.=<<___;
  916. .type bn_mulx4x_mont,\@function,6
  917. .align 32
  918. bn_mulx4x_mont:
  919. .cfi_startproc
  920. mov %rsp,%rax
  921. .cfi_def_cfa_register %rax
  922. .Lmulx4x_enter:
  923. push %rbx
  924. .cfi_push %rbx
  925. push %rbp
  926. .cfi_push %rbp
  927. push %r12
  928. .cfi_push %r12
  929. push %r13
  930. .cfi_push %r13
  931. push %r14
  932. .cfi_push %r14
  933. push %r15
  934. .cfi_push %r15
  935. .Lmulx4x_prologue:
  936. shl \$3,${num}d # convert $num to bytes
  937. xor %r10,%r10
  938. sub $num,%r10 # -$num
  939. mov ($n0),$n0 # *n0
  940. lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8)
  941. and \$-128,%rbp
  942. mov %rsp,%r11
  943. sub %rbp,%r11
  944. and \$-4096,%r11
  945. lea (%rbp,%r11),%rsp
  946. mov (%rsp),%r10
  947. cmp %rbp,%rsp
  948. ja .Lmulx4x_page_walk
  949. jmp .Lmulx4x_page_walk_done
  950. .align 16
  951. .Lmulx4x_page_walk:
  952. lea -4096(%rsp),%rsp
  953. mov (%rsp),%r10
  954. cmp %rbp,%rsp
  955. ja .Lmulx4x_page_walk
  956. .Lmulx4x_page_walk_done:
  957. lea ($bp,$num),%r10
  958. ##############################################################
  959. # Stack layout
  960. # +0 num
  961. # +8 off-loaded &b[i]
  962. # +16 end of b[num]
  963. # +24 saved n0
  964. # +32 saved rp
  965. # +40 saved %rsp
  966. # +48 inner counter
  967. # +56
  968. # +64 tmp[num+1]
  969. #
  970. mov $num,0(%rsp) # save $num
  971. shr \$5,$num
  972. mov %r10,16(%rsp) # end of b[num]
  973. sub \$1,$num
  974. mov $n0, 24(%rsp) # save *n0
  975. mov $rp, 32(%rsp) # save $rp
  976. mov %rax,40(%rsp) # save original %rsp
  977. .cfi_cfa_expression %rsp+40,deref,+8
  978. mov $num,48(%rsp) # inner counter
  979. jmp .Lmulx4x_body
  980. .align 32
  981. .Lmulx4x_body:
  982. ___
  983. my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
  984. ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
  985. my $rptr=$bptr;
  986. $code.=<<___;
  987. lea 8($bp),$bptr
  988. mov ($bp),%rdx # b[0], $bp==%rdx actually
  989. lea 64+32(%rsp),$tptr
  990. mov %rdx,$bi
  991. mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
  992. mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
  993. add %rax,%r11
  994. mov $bptr,8(%rsp) # off-load &b[i]
  995. mulx 2*8($aptr),%r12,%r13 # ...
  996. adc %r14,%r12
  997. adc \$0,%r13
  998. mov $mi,$bptr # borrow $bptr
  999. imulq 24(%rsp),$mi # "t[0]"*n0
  1000. xor $zero,$zero # cf=0, of=0
  1001. mulx 3*8($aptr),%rax,%r14
  1002. mov $mi,%rdx
  1003. lea 4*8($aptr),$aptr
  1004. adcx %rax,%r13
  1005. adcx $zero,%r14 # cf=0
  1006. mulx 0*8($nptr),%rax,%r10
  1007. adcx %rax,$bptr # discarded
  1008. adox %r11,%r10
  1009. mulx 1*8($nptr),%rax,%r11
  1010. adcx %rax,%r10
  1011. adox %r12,%r11
  1012. .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
  1013. mov 48(%rsp),$bptr # counter value
  1014. mov %r10,-4*8($tptr)
  1015. adcx %rax,%r11
  1016. adox %r13,%r12
  1017. mulx 3*8($nptr),%rax,%r15
  1018. mov $bi,%rdx
  1019. mov %r11,-3*8($tptr)
  1020. adcx %rax,%r12
  1021. adox $zero,%r15 # of=0
  1022. lea 4*8($nptr),$nptr
  1023. mov %r12,-2*8($tptr)
  1024. jmp .Lmulx4x_1st
  1025. .align 32
  1026. .Lmulx4x_1st:
  1027. adcx $zero,%r15 # cf=0, modulo-scheduled
  1028. mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
  1029. adcx %r14,%r10
  1030. mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
  1031. adcx %rax,%r11
  1032. mulx 2*8($aptr),%r12,%rax # ...
  1033. adcx %r14,%r12
  1034. mulx 3*8($aptr),%r13,%r14
  1035. .byte 0x67,0x67
  1036. mov $mi,%rdx
  1037. adcx %rax,%r13
  1038. adcx $zero,%r14 # cf=0
  1039. lea 4*8($aptr),$aptr
  1040. lea 4*8($tptr),$tptr
  1041. adox %r15,%r10
  1042. mulx 0*8($nptr),%rax,%r15
  1043. adcx %rax,%r10
  1044. adox %r15,%r11
  1045. mulx 1*8($nptr),%rax,%r15
  1046. adcx %rax,%r11
  1047. adox %r15,%r12
  1048. mulx 2*8($nptr),%rax,%r15
  1049. mov %r10,-5*8($tptr)
  1050. adcx %rax,%r12
  1051. mov %r11,-4*8($tptr)
  1052. adox %r15,%r13
  1053. mulx 3*8($nptr),%rax,%r15
  1054. mov $bi,%rdx
  1055. mov %r12,-3*8($tptr)
  1056. adcx %rax,%r13
  1057. adox $zero,%r15
  1058. lea 4*8($nptr),$nptr
  1059. mov %r13,-2*8($tptr)
  1060. dec $bptr # of=0, pass cf
  1061. jnz .Lmulx4x_1st
  1062. mov 0(%rsp),$num # load num
  1063. mov 8(%rsp),$bptr # re-load &b[i]
  1064. adc $zero,%r15 # modulo-scheduled
  1065. add %r15,%r14
  1066. sbb %r15,%r15 # top-most carry
  1067. mov %r14,-1*8($tptr)
  1068. jmp .Lmulx4x_outer
  1069. .align 32
  1070. .Lmulx4x_outer:
  1071. mov ($bptr),%rdx # b[i]
  1072. lea 8($bptr),$bptr # b++
  1073. sub $num,$aptr # rewind $aptr
  1074. mov %r15,($tptr) # save top-most carry
  1075. lea 64+4*8(%rsp),$tptr
  1076. sub $num,$nptr # rewind $nptr
  1077. mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
  1078. xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
  1079. mov %rdx,$bi
  1080. mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
  1081. adox -4*8($tptr),$mi
  1082. adcx %r14,%r11
  1083. mulx 2*8($aptr),%r15,%r13 # ...
  1084. adox -3*8($tptr),%r11
  1085. adcx %r15,%r12
  1086. adox -2*8($tptr),%r12
  1087. adcx $zero,%r13
  1088. adox $zero,%r13
  1089. mov $bptr,8(%rsp) # off-load &b[i]
  1090. mov $mi,%r15
  1091. imulq 24(%rsp),$mi # "t[0]"*n0
  1092. xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
  1093. mulx 3*8($aptr),%rax,%r14
  1094. mov $mi,%rdx
  1095. adcx %rax,%r13
  1096. adox -1*8($tptr),%r13
  1097. adcx $zero,%r14
  1098. lea 4*8($aptr),$aptr
  1099. adox $zero,%r14
  1100. mulx 0*8($nptr),%rax,%r10
  1101. adcx %rax,%r15 # discarded
  1102. adox %r11,%r10
  1103. mulx 1*8($nptr),%rax,%r11
  1104. adcx %rax,%r10
  1105. adox %r12,%r11
  1106. mulx 2*8($nptr),%rax,%r12
  1107. mov %r10,-4*8($tptr)
  1108. adcx %rax,%r11
  1109. adox %r13,%r12
  1110. mulx 3*8($nptr),%rax,%r15
  1111. mov $bi,%rdx
  1112. mov %r11,-3*8($tptr)
  1113. lea 4*8($nptr),$nptr
  1114. adcx %rax,%r12
  1115. adox $zero,%r15 # of=0
  1116. mov 48(%rsp),$bptr # counter value
  1117. mov %r12,-2*8($tptr)
  1118. jmp .Lmulx4x_inner
  1119. .align 32
  1120. .Lmulx4x_inner:
  1121. mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
  1122. adcx $zero,%r15 # cf=0, modulo-scheduled
  1123. adox %r14,%r10
  1124. mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
  1125. adcx 0*8($tptr),%r10
  1126. adox %rax,%r11
  1127. mulx 2*8($aptr),%r12,%rax # ...
  1128. adcx 1*8($tptr),%r11
  1129. adox %r14,%r12
  1130. mulx 3*8($aptr),%r13,%r14
  1131. mov $mi,%rdx
  1132. adcx 2*8($tptr),%r12
  1133. adox %rax,%r13
  1134. adcx 3*8($tptr),%r13
  1135. adox $zero,%r14 # of=0
  1136. lea 4*8($aptr),$aptr
  1137. lea 4*8($tptr),$tptr
  1138. adcx $zero,%r14 # cf=0
  1139. adox %r15,%r10
  1140. mulx 0*8($nptr),%rax,%r15
  1141. adcx %rax,%r10
  1142. adox %r15,%r11
  1143. mulx 1*8($nptr),%rax,%r15
  1144. adcx %rax,%r11
  1145. adox %r15,%r12
  1146. mulx 2*8($nptr),%rax,%r15
  1147. mov %r10,-5*8($tptr)
  1148. adcx %rax,%r12
  1149. adox %r15,%r13
  1150. mulx 3*8($nptr),%rax,%r15
  1151. mov $bi,%rdx
  1152. mov %r11,-4*8($tptr)
  1153. mov %r12,-3*8($tptr)
  1154. adcx %rax,%r13
  1155. adox $zero,%r15
  1156. lea 4*8($nptr),$nptr
  1157. mov %r13,-2*8($tptr)
  1158. dec $bptr # of=0, pass cf
  1159. jnz .Lmulx4x_inner
  1160. mov 0(%rsp),$num # load num
  1161. mov 8(%rsp),$bptr # re-load &b[i]
  1162. adc $zero,%r15 # modulo-scheduled
  1163. sub 0*8($tptr),$zero # pull top-most carry
  1164. adc %r15,%r14
  1165. sbb %r15,%r15 # top-most carry
  1166. mov %r14,-1*8($tptr)
  1167. cmp 16(%rsp),$bptr
  1168. jne .Lmulx4x_outer
  1169. lea 64(%rsp),$tptr
  1170. sub $num,$nptr # rewind $nptr
  1171. neg %r15
  1172. mov $num,%rdx
  1173. shr \$3+2,$num # %cf=0
  1174. mov 32(%rsp),$rptr # restore rp
  1175. jmp .Lmulx4x_sub
  1176. .align 32
  1177. .Lmulx4x_sub:
  1178. mov 8*0($tptr),%r11
  1179. mov 8*1($tptr),%r12
  1180. mov 8*2($tptr),%r13
  1181. mov 8*3($tptr),%r14
  1182. lea 8*4($tptr),$tptr
  1183. sbb 8*0($nptr),%r11
  1184. sbb 8*1($nptr),%r12
  1185. sbb 8*2($nptr),%r13
  1186. sbb 8*3($nptr),%r14
  1187. lea 8*4($nptr),$nptr
  1188. mov %r11,8*0($rptr)
  1189. mov %r12,8*1($rptr)
  1190. mov %r13,8*2($rptr)
  1191. mov %r14,8*3($rptr)
  1192. lea 8*4($rptr),$rptr
  1193. dec $num # preserves %cf
  1194. jnz .Lmulx4x_sub
  1195. sbb \$0,%r15 # top-most carry
  1196. lea 64(%rsp),$tptr
  1197. sub %rdx,$rptr # rewind
  1198. movq %r15,%xmm1
  1199. pxor %xmm0,%xmm0
  1200. pshufd \$0,%xmm1,%xmm1
  1201. mov 40(%rsp),%rsi # restore %rsp
  1202. .cfi_def_cfa %rsi,8
  1203. jmp .Lmulx4x_cond_copy
  1204. .align 32
  1205. .Lmulx4x_cond_copy:
  1206. movdqa 16*0($tptr),%xmm2
  1207. movdqa 16*1($tptr),%xmm3
  1208. lea 16*2($tptr),$tptr
  1209. movdqu 16*0($rptr),%xmm4
  1210. movdqu 16*1($rptr),%xmm5
  1211. lea 16*2($rptr),$rptr
  1212. movdqa %xmm0,-16*2($tptr) # zero tp
  1213. movdqa %xmm0,-16*1($tptr)
  1214. pcmpeqd %xmm1,%xmm0
  1215. pand %xmm1,%xmm2
  1216. pand %xmm1,%xmm3
  1217. pand %xmm0,%xmm4
  1218. pand %xmm0,%xmm5
  1219. pxor %xmm0,%xmm0
  1220. por %xmm2,%xmm4
  1221. por %xmm3,%xmm5
  1222. movdqu %xmm4,-16*2($rptr)
  1223. movdqu %xmm5,-16*1($rptr)
  1224. sub \$32,%rdx
  1225. jnz .Lmulx4x_cond_copy
  1226. mov %rdx,($tptr)
  1227. mov \$1,%rax
  1228. mov -48(%rsi),%r15
  1229. .cfi_restore %r15
  1230. mov -40(%rsi),%r14
  1231. .cfi_restore %r14
  1232. mov -32(%rsi),%r13
  1233. .cfi_restore %r13
  1234. mov -24(%rsi),%r12
  1235. .cfi_restore %r12
  1236. mov -16(%rsi),%rbp
  1237. .cfi_restore %rbp
  1238. mov -8(%rsi),%rbx
  1239. .cfi_restore %rbx
  1240. lea (%rsi),%rsp
  1241. .cfi_def_cfa_register %rsp
  1242. .Lmulx4x_epilogue:
  1243. ret
  1244. .cfi_endproc
  1245. .size bn_mulx4x_mont,.-bn_mulx4x_mont
  1246. ___
  1247. }}}
  1248. $code.=<<___;
  1249. .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1250. .align 16
  1251. ___
  1252. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1253. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1254. if ($win64) {
  1255. $rec="%rcx";
  1256. $frame="%rdx";
  1257. $context="%r8";
  1258. $disp="%r9";
  1259. $code.=<<___;
  1260. .extern __imp_RtlVirtualUnwind
  1261. .type mul_handler,\@abi-omnipotent
  1262. .align 16
  1263. mul_handler:
  1264. push %rsi
  1265. push %rdi
  1266. push %rbx
  1267. push %rbp
  1268. push %r12
  1269. push %r13
  1270. push %r14
  1271. push %r15
  1272. pushfq
  1273. sub \$64,%rsp
  1274. mov 120($context),%rax # pull context->Rax
  1275. mov 248($context),%rbx # pull context->Rip
  1276. mov 8($disp),%rsi # disp->ImageBase
  1277. mov 56($disp),%r11 # disp->HandlerData
  1278. mov 0(%r11),%r10d # HandlerData[0]
  1279. lea (%rsi,%r10),%r10 # end of prologue label
  1280. cmp %r10,%rbx # context->Rip<end of prologue label
  1281. jb .Lcommon_seh_tail
  1282. mov 152($context),%rax # pull context->Rsp
  1283. mov 4(%r11),%r10d # HandlerData[1]
  1284. lea (%rsi,%r10),%r10 # epilogue label
  1285. cmp %r10,%rbx # context->Rip>=epilogue label
  1286. jae .Lcommon_seh_tail
  1287. mov 192($context),%r10 # pull $num
  1288. mov 8(%rax,%r10,8),%rax # pull saved stack pointer
  1289. jmp .Lcommon_pop_regs
  1290. .size mul_handler,.-mul_handler
  1291. .type sqr_handler,\@abi-omnipotent
  1292. .align 16
  1293. sqr_handler:
  1294. push %rsi
  1295. push %rdi
  1296. push %rbx
  1297. push %rbp
  1298. push %r12
  1299. push %r13
  1300. push %r14
  1301. push %r15
  1302. pushfq
  1303. sub \$64,%rsp
  1304. mov 120($context),%rax # pull context->Rax
  1305. mov 248($context),%rbx # pull context->Rip
  1306. mov 8($disp),%rsi # disp->ImageBase
  1307. mov 56($disp),%r11 # disp->HandlerData
  1308. mov 0(%r11),%r10d # HandlerData[0]
  1309. lea (%rsi,%r10),%r10 # end of prologue label
  1310. cmp %r10,%rbx # context->Rip<.Lsqr_prologue
  1311. jb .Lcommon_seh_tail
  1312. mov 4(%r11),%r10d # HandlerData[1]
  1313. lea (%rsi,%r10),%r10 # body label
  1314. cmp %r10,%rbx # context->Rip<.Lsqr_body
  1315. jb .Lcommon_pop_regs
  1316. mov 152($context),%rax # pull context->Rsp
  1317. mov 8(%r11),%r10d # HandlerData[2]
  1318. lea (%rsi,%r10),%r10 # epilogue label
  1319. cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
  1320. jae .Lcommon_seh_tail
  1321. mov 40(%rax),%rax # pull saved stack pointer
  1322. .Lcommon_pop_regs:
  1323. mov -8(%rax),%rbx
  1324. mov -16(%rax),%rbp
  1325. mov -24(%rax),%r12
  1326. mov -32(%rax),%r13
  1327. mov -40(%rax),%r14
  1328. mov -48(%rax),%r15
  1329. mov %rbx,144($context) # restore context->Rbx
  1330. mov %rbp,160($context) # restore context->Rbp
  1331. mov %r12,216($context) # restore context->R12
  1332. mov %r13,224($context) # restore context->R13
  1333. mov %r14,232($context) # restore context->R14
  1334. mov %r15,240($context) # restore context->R15
  1335. .Lcommon_seh_tail:
  1336. mov 8(%rax),%rdi
  1337. mov 16(%rax),%rsi
  1338. mov %rax,152($context) # restore context->Rsp
  1339. mov %rsi,168($context) # restore context->Rsi
  1340. mov %rdi,176($context) # restore context->Rdi
  1341. mov 40($disp),%rdi # disp->ContextRecord
  1342. mov $context,%rsi # context
  1343. mov \$154,%ecx # sizeof(CONTEXT)
  1344. .long 0xa548f3fc # cld; rep movsq
  1345. mov $disp,%rsi
  1346. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1347. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1348. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1349. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1350. mov 40(%rsi),%r10 # disp->ContextRecord
  1351. lea 56(%rsi),%r11 # &disp->HandlerData
  1352. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1353. mov %r10,32(%rsp) # arg5
  1354. mov %r11,40(%rsp) # arg6
  1355. mov %r12,48(%rsp) # arg7
  1356. mov %rcx,56(%rsp) # arg8, (NULL)
  1357. call *__imp_RtlVirtualUnwind(%rip)
  1358. mov \$1,%eax # ExceptionContinueSearch
  1359. add \$64,%rsp
  1360. popfq
  1361. pop %r15
  1362. pop %r14
  1363. pop %r13
  1364. pop %r12
  1365. pop %rbp
  1366. pop %rbx
  1367. pop %rdi
  1368. pop %rsi
  1369. ret
  1370. .size sqr_handler,.-sqr_handler
  1371. .section .pdata
  1372. .align 4
  1373. .rva .LSEH_begin_bn_mul_mont
  1374. .rva .LSEH_end_bn_mul_mont
  1375. .rva .LSEH_info_bn_mul_mont
  1376. .rva .LSEH_begin_bn_mul4x_mont
  1377. .rva .LSEH_end_bn_mul4x_mont
  1378. .rva .LSEH_info_bn_mul4x_mont
  1379. .rva .LSEH_begin_bn_sqr8x_mont
  1380. .rva .LSEH_end_bn_sqr8x_mont
  1381. .rva .LSEH_info_bn_sqr8x_mont
  1382. ___
  1383. $code.=<<___ if ($addx);
  1384. .rva .LSEH_begin_bn_mulx4x_mont
  1385. .rva .LSEH_end_bn_mulx4x_mont
  1386. .rva .LSEH_info_bn_mulx4x_mont
  1387. ___
  1388. $code.=<<___;
  1389. .section .xdata
  1390. .align 8
  1391. .LSEH_info_bn_mul_mont:
  1392. .byte 9,0,0,0
  1393. .rva mul_handler
  1394. .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
  1395. .LSEH_info_bn_mul4x_mont:
  1396. .byte 9,0,0,0
  1397. .rva mul_handler
  1398. .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
  1399. .LSEH_info_bn_sqr8x_mont:
  1400. .byte 9,0,0,0
  1401. .rva sqr_handler
  1402. .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
  1403. .align 8
  1404. ___
  1405. $code.=<<___ if ($addx);
  1406. .LSEH_info_bn_mulx4x_mont:
  1407. .byte 9,0,0,0
  1408. .rva sqr_handler
  1409. .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
  1410. .align 8
  1411. ___
  1412. }
  1413. print $code;
  1414. close STDOUT;