  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # October 2005.
  9. #
  10. # Montgomery multiplication routine for x86_64. While it gives a modest
  11. # 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
  12. # than twice (>2x) as fast. The most common rsa1024 sign is improved by
  13. # a respectable 50%. It remains to be seen if loop unrolling and a
  14. # dedicated squaring routine can provide further improvement...
  15. # July 2011.
  16. #
  17. # Add a dedicated squaring procedure. The performance improvement varies
  18. # from platform to platform, but on average it is ~5%/15%/25%/33%
  19. # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
  20. # August 2011.
  21. #
  22. # Unroll and modulo-schedule the inner loops in such a manner that they
  23. # "fall through" for an input length of 8, which is critical for
  24. # 1024-bit RSA *sign*. The average performance improvement in comparison
  25. # to the *initial* 2005 version of this module is ~0%/30%/40%/45%
  26. # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
  27. # June 2013.
  28. #
  29. # Optimize reduction in the squaring procedure and improve 1024+-bit RSA
  30. # sign performance by 10-16% on Intel Sandy Bridge and later
  31. # (virtually the same on non-Intel processors).
  32. # August 2013.
  33. #
  34. # Add MULX/ADOX/ADCX code path.
  35. $flavour = shift;
  36. $output = shift;
  37. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  38. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  39. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  40. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  41. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  42. die "can't locate x86_64-xlate.pl";
  43. open OUT,"| \"$^X\" $xlate $flavour $output";
  44. *STDOUT=*OUT;
  45. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  46. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  47. $addx = ($1>=2.23);
  48. }
  49. if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  50. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  51. $addx = ($1>=2.10);
  52. }
  53. if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  54. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  55. $addx = ($1>=12);
  56. }
  57. # int bn_mul_mont(
  58. $rp="%rdi"; # BN_ULONG *rp,
  59. $ap="%rsi"; # const BN_ULONG *ap,
  60. $bp="%rdx"; # const BN_ULONG *bp,
  61. $np="%rcx"; # const BN_ULONG *np,
  62. $n0="%r8"; # const BN_ULONG *n0,
  63. $num="%r9"; # int num);
  64. $lo0="%r10";
  65. $hi0="%r11";
  66. $hi1="%r13";
  67. $i="%r14";
  68. $j="%r15";
  69. $m0="%rbx";
  70. $m1="%rbp";
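For orientation, the word-serial operation bn_mul_mont implements is rp = ap*bp*2^(-64*num) mod np, with n0[0] = -np^(-1) mod 2^64. Below is a minimal Math::BigInt sketch of that algorithm, mirroring the outer-loop structure of the code that follows (one bp[i] word per outer iteration, m1 = "tp[0]"*n0, then a one-word shift). The names mont_mul_ref and words_to_int are illustrative only; the inputs are assumed fully reduced (ap, bp < np, np odd), and the sketch is of course neither constant-time nor fast.

use Math::BigInt;

my $WORD = Math::BigInt->new(1)->blsft(64);        # 2^64

sub words_to_int {                                  # little-endian 64-bit limbs -> integer
    my $x = Math::BigInt->bzero();                  # limbs may be plain integers or Math::BigInt objects
    $x = $x * $WORD + $_ for reverse @_;
    return $x;
}

sub mont_mul_ref {
    my ($ap, $bp, $np, $num) = @_;                  # array refs of $num limbs each
    my $a  = words_to_int(@$ap);
    my $n  = words_to_int(@$np);
    my $n0 = $WORD - $n->copy->bmodinv($WORD);      # -np^(-1) mod 2^64; the real caller passes this in as n0[0]
    my $tp = Math::BigInt->bzero();
    for my $i (0 .. $num-1) {
        $tp += $a * $bp->[$i];                      # ap[]*bp[i]
        my $m1 = ($tp % $WORD) * $n0 % $WORD;       # "tp[0]"*n0
        $tp = ($tp + $m1 * $n) / $WORD;             # low limb is now zero, shift it out
    }
    $tp -= $n if $tp >= $n;                         # final conditional subtraction (.Lsub/.Lcopy)
    return $tp;                                     # == ap*bp*2^(-64*num) mod np
}

With R = 2^(64*num), feeding R^2 mod np as one operand converts the other into Montgomery form, which is how the surrounding C code in crypto/bn typically uses this primitive.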
  71. $code=<<___;
  72. .text
  73. .extern OPENSSL_ia32cap_P
  74. .globl bn_mul_mont
  75. .type bn_mul_mont,\@function,6
  76. .align 16
  77. bn_mul_mont:
  78. test \$3,${num}d
  79. jnz .Lmul_enter
  80. cmp \$8,${num}d
  81. jb .Lmul_enter
  82. ___
  83. $code.=<<___ if ($addx);
  84. mov OPENSSL_ia32cap_P+8(%rip),%r11d
  85. ___
  86. $code.=<<___;
  87. cmp $ap,$bp
  88. jne .Lmul4x_enter
  89. test \$7,${num}d
  90. jz .Lsqr8x_enter
  91. jmp .Lmul4x_enter
  92. .align 16
  93. .Lmul_enter:
  94. push %rbx
  95. push %rbp
  96. push %r12
  97. push %r13
  98. push %r14
  99. push %r15
  100. mov ${num}d,${num}d
  101. lea 2($num),%r10
  102. mov %rsp,%r11
  103. neg %r10
  104. lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
  105. and \$-1024,%rsp # minimize TLB usage
  106. mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
  107. .Lmul_body:
  108. mov $bp,%r12 # reassign $bp
  109. ___
  110. $bp="%r12";
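The block below pulls n0[0] and forms m1 = "tp[0]"*n0; the point of that multiplier is that adding m1*np to the running total clears its low 64-bit word, which is why the np[0]*m1 low-word addition is commented "discarded". A quick Math::BigInt check of the identity, with hypothetical word values:

use Math::BigInt;
my $W   = Math::BigInt->new(1)->blsft(64);          # 2^64
my $np0 = Math::BigInt->new("0xffffffff00000001");  # any odd np[0] will do
my $n0  = $W - $np0->copy->bmodinv($W);             # -np^(-1) mod 2^64
my $tp0 = Math::BigInt->new("0x123456789abcdef0");  # arbitrary running low word
my $m1  = $tp0 * $n0 % $W;                          # "tp[0]"*n0
my $ok  = (($tp0 + $m1 * $np0) % $W)->is_zero();
print $ok ? "low word cleared\n" : "identity broken\n";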
  111. $code.=<<___;
  112. mov ($n0),$n0 # pull n0[0] value
  113. mov ($bp),$m0 # m0=bp[0]
  114. mov ($ap),%rax
  115. xor $i,$i # i=0
  116. xor $j,$j # j=0
  117. mov $n0,$m1
  118. mulq $m0 # ap[0]*bp[0]
  119. mov %rax,$lo0
  120. mov ($np),%rax
  121. imulq $lo0,$m1 # "tp[0]"*n0
  122. mov %rdx,$hi0
  123. mulq $m1 # np[0]*m1
  124. add %rax,$lo0 # discarded
  125. mov 8($ap),%rax
  126. adc \$0,%rdx
  127. mov %rdx,$hi1
  128. lea 1($j),$j # j++
  129. jmp .L1st_enter
  130. .align 16
  131. .L1st:
  132. add %rax,$hi1
  133. mov ($ap,$j,8),%rax
  134. adc \$0,%rdx
  135. add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
  136. mov $lo0,$hi0
  137. adc \$0,%rdx
  138. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  139. mov %rdx,$hi1
  140. .L1st_enter:
  141. mulq $m0 # ap[j]*bp[0]
  142. add %rax,$hi0
  143. mov ($np,$j,8),%rax
  144. adc \$0,%rdx
  145. lea 1($j),$j # j++
  146. mov %rdx,$lo0
  147. mulq $m1 # np[j]*m1
  148. cmp $num,$j
  149. jne .L1st
  150. add %rax,$hi1
  151. mov ($ap),%rax # ap[0]
  152. adc \$0,%rdx
  153. add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
  154. adc \$0,%rdx
  155. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  156. mov %rdx,$hi1
  157. mov $lo0,$hi0
  158. xor %rdx,%rdx
  159. add $hi0,$hi1
  160. adc \$0,%rdx
  161. mov $hi1,-8(%rsp,$num,8)
  162. mov %rdx,(%rsp,$num,8) # store upmost overflow bit
  163. lea 1($i),$i # i++
  164. jmp .Louter
  165. .align 16
  166. .Louter:
  167. mov ($bp,$i,8),$m0 # m0=bp[i]
  168. xor $j,$j # j=0
  169. mov $n0,$m1
  170. mov (%rsp),$lo0
  171. mulq $m0 # ap[0]*bp[i]
  172. add %rax,$lo0 # ap[0]*bp[i]+tp[0]
  173. mov ($np),%rax
  174. adc \$0,%rdx
  175. imulq $lo0,$m1 # tp[0]*n0
  176. mov %rdx,$hi0
  177. mulq $m1 # np[0]*m1
  178. add %rax,$lo0 # discarded
  179. mov 8($ap),%rax
  180. adc \$0,%rdx
  181. mov 8(%rsp),$lo0 # tp[1]
  182. mov %rdx,$hi1
  183. lea 1($j),$j # j++
  184. jmp .Linner_enter
  185. .align 16
  186. .Linner:
  187. add %rax,$hi1
  188. mov ($ap,$j,8),%rax
  189. adc \$0,%rdx
  190. add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
  191. mov (%rsp,$j,8),$lo0
  192. adc \$0,%rdx
  193. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  194. mov %rdx,$hi1
  195. .Linner_enter:
  196. mulq $m0 # ap[j]*bp[i]
  197. add %rax,$hi0
  198. mov ($np,$j,8),%rax
  199. adc \$0,%rdx
  200. add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
  201. mov %rdx,$hi0
  202. adc \$0,$hi0
  203. lea 1($j),$j # j++
  204. mulq $m1 # np[j]*m1
  205. cmp $num,$j
  206. jne .Linner
  207. add %rax,$hi1
  208. mov ($ap),%rax # ap[0]
  209. adc \$0,%rdx
  210. add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
  211. mov (%rsp,$j,8),$lo0
  212. adc \$0,%rdx
  213. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  214. mov %rdx,$hi1
  215. xor %rdx,%rdx
  216. add $hi0,$hi1
  217. adc \$0,%rdx
  218. add $lo0,$hi1 # pull upmost overflow bit
  219. adc \$0,%rdx
  220. mov $hi1,-8(%rsp,$num,8)
  221. mov %rdx,(%rsp,$num,8) # store upmost overflow bit
  222. lea 1($i),$i # i++
  223. cmp $num,$i
  224. jb .Louter
  225. xor $i,$i # i=0 and clear CF!
  226. mov (%rsp),%rax # tp[0]
  227. lea (%rsp),$ap # borrow ap for tp
  228. mov $num,$j # j=num
  229. jmp .Lsub
  230. .align 16
  231. .Lsub: sbb ($np,$i,8),%rax
  232. mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
  233. mov 8($ap,$i,8),%rax # tp[i+1]
  234. lea 1($i),$i # i++
  235. dec $j # doesn't affect CF!
  236. jnz .Lsub
  237. sbb \$0,%rax # handle upmost overflow bit
  238. xor $i,$i
  239. mov $num,$j # j=num
  240. .align 16
  241. .Lcopy: # copy or in-place refresh
  242. mov (%rsp,$i,8),$ap
  243. mov ($rp,$i,8),$np
  244. xor $np,$ap # conditional select:
  245. and %rax,$ap # ((ap ^ np) & %rax) ^ np
  246. xor $np,$ap # ap = borrow?tp:rp
  247. mov $i,(%rsp,$i,8) # zap temporary vector
  248. mov $ap,($rp,$i,8) # rp[i]=tp[i]
  249. lea 1($i),$i
  250. sub \$1,$j
  251. jnz .Lcopy
  252. mov 8(%rsp,$num,8),%rsi # restore %rsp
  253. mov \$1,%rax
  254. mov (%rsi),%r15
  255. mov 8(%rsi),%r14
  256. mov 16(%rsi),%r13
  257. mov 24(%rsi),%r12
  258. mov 32(%rsi),%rbp
  259. mov 40(%rsi),%rbx
  260. lea 48(%rsi),%rsp
  261. .Lmul_epilogue:
  262. ret
  263. .size bn_mul_mont,.-bn_mul_mont
  264. ___
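The .Lsub/.Lcopy tail above performs the final conditional subtraction without a data-dependent branch: .Lsub writes tp-np to rp and folds the borrow into %rax, which ends up as 0 or all-ones, and .Lcopy then keeps either tp or the freshly written rp via the xor/and/xor idiom noted in the comments, zapping the temporary vector as it goes. A one-word Perl rendering of that select (illustrative name):

sub masked_select {                      # ((tp ^ rp) & mask) ^ rp
    my ($mask, $tp_i, $rp_i) = @_;       # $mask is 0 or 0xffffffffffffffff
    return (($tp_i ^ $rp_i) & $mask) ^ $rp_i;   # mask set -> tp_i, mask clear -> rp_i
}

The 4x and mulx paths below do the same thing 128 bits at a time with pxor/pand, after punpcklqdq broadcasts the mask into an XMM register.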
  265. {{{
  266. my @A=("%r10","%r11");
  267. my @N=("%r13","%rdi");
  268. $code.=<<___;
  269. .type bn_mul4x_mont,\@function,6
  270. .align 16
  271. bn_mul4x_mont:
  272. .Lmul4x_enter:
  273. ___
  274. $code.=<<___ if ($addx);
  275. and \$0x80100,%r11d
  276. cmp \$0x80100,%r11d
  277. je .Lmulx4x_enter
  278. ___
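The 0x80100 mask tested above (and again in bn_sqr8x_mont) selects two CPUID.(EAX=7):EBX feature bits at once, BMI2 for MULX and ADX for ADCX/ADOX, assuming the usual OPENSSL_ia32cap_P layout where the word at offset 8 holds that EBX value:

my $BMI2 = 1 << 8;                       # CPUID.(EAX=7):EBX bit 8  - MULX available
my $ADX  = 1 << 19;                      # CPUID.(EAX=7):EBX bit 19 - ADCX/ADOX available
printf "0x%x\n", $BMI2 | $ADX;           # prints 0x80100, the mask used above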
  279. $code.=<<___;
  280. push %rbx
  281. push %rbp
  282. push %r12
  283. push %r13
  284. push %r14
  285. push %r15
  286. mov ${num}d,${num}d
  287. lea 4($num),%r10
  288. mov %rsp,%r11
  289. neg %r10
  290. lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
  291. and \$-1024,%rsp # minimize TLB usage
  292. mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
  293. .Lmul4x_body:
  294. mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
  295. mov %rdx,%r12 # reassign $bp
  296. ___
  297. $bp="%r12";
  298. $code.=<<___;
  299. mov ($n0),$n0 # pull n0[0] value
  300. mov ($bp),$m0 # m0=bp[0]
  301. mov ($ap),%rax
  302. xor $i,$i # i=0
  303. xor $j,$j # j=0
  304. mov $n0,$m1
  305. mulq $m0 # ap[0]*bp[0]
  306. mov %rax,$A[0]
  307. mov ($np),%rax
  308. imulq $A[0],$m1 # "tp[0]"*n0
  309. mov %rdx,$A[1]
  310. mulq $m1 # np[0]*m1
  311. add %rax,$A[0] # discarded
  312. mov 8($ap),%rax
  313. adc \$0,%rdx
  314. mov %rdx,$N[1]
  315. mulq $m0
  316. add %rax,$A[1]
  317. mov 8($np),%rax
  318. adc \$0,%rdx
  319. mov %rdx,$A[0]
  320. mulq $m1
  321. add %rax,$N[1]
  322. mov 16($ap),%rax
  323. adc \$0,%rdx
  324. add $A[1],$N[1]
  325. lea 4($j),$j # j++
  326. adc \$0,%rdx
  327. mov $N[1],(%rsp)
  328. mov %rdx,$N[0]
  329. jmp .L1st4x
  330. .align 16
  331. .L1st4x:
  332. mulq $m0 # ap[j]*bp[0]
  333. add %rax,$A[0]
  334. mov -16($np,$j,8),%rax
  335. adc \$0,%rdx
  336. mov %rdx,$A[1]
  337. mulq $m1 # np[j]*m1
  338. add %rax,$N[0]
  339. mov -8($ap,$j,8),%rax
  340. adc \$0,%rdx
  341. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  342. adc \$0,%rdx
  343. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  344. mov %rdx,$N[1]
  345. mulq $m0 # ap[j]*bp[0]
  346. add %rax,$A[1]
  347. mov -8($np,$j,8),%rax
  348. adc \$0,%rdx
  349. mov %rdx,$A[0]
  350. mulq $m1 # np[j]*m1
  351. add %rax,$N[1]
  352. mov ($ap,$j,8),%rax
  353. adc \$0,%rdx
  354. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  355. adc \$0,%rdx
  356. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  357. mov %rdx,$N[0]
  358. mulq $m0 # ap[j]*bp[0]
  359. add %rax,$A[0]
  360. mov ($np,$j,8),%rax
  361. adc \$0,%rdx
  362. mov %rdx,$A[1]
  363. mulq $m1 # np[j]*m1
  364. add %rax,$N[0]
  365. mov 8($ap,$j,8),%rax
  366. adc \$0,%rdx
  367. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  368. adc \$0,%rdx
  369. mov $N[0],-8(%rsp,$j,8) # tp[j-1]
  370. mov %rdx,$N[1]
  371. mulq $m0 # ap[j]*bp[0]
  372. add %rax,$A[1]
  373. mov 8($np,$j,8),%rax
  374. adc \$0,%rdx
  375. lea 4($j),$j # j++
  376. mov %rdx,$A[0]
  377. mulq $m1 # np[j]*m1
  378. add %rax,$N[1]
  379. mov -16($ap,$j,8),%rax
  380. adc \$0,%rdx
  381. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  382. adc \$0,%rdx
  383. mov $N[1],-32(%rsp,$j,8) # tp[j-1]
  384. mov %rdx,$N[0]
  385. cmp $num,$j
  386. jb .L1st4x
  387. mulq $m0 # ap[j]*bp[0]
  388. add %rax,$A[0]
  389. mov -16($np,$j,8),%rax
  390. adc \$0,%rdx
  391. mov %rdx,$A[1]
  392. mulq $m1 # np[j]*m1
  393. add %rax,$N[0]
  394. mov -8($ap,$j,8),%rax
  395. adc \$0,%rdx
  396. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  397. adc \$0,%rdx
  398. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  399. mov %rdx,$N[1]
  400. mulq $m0 # ap[j]*bp[0]
  401. add %rax,$A[1]
  402. mov -8($np,$j,8),%rax
  403. adc \$0,%rdx
  404. mov %rdx,$A[0]
  405. mulq $m1 # np[j]*m1
  406. add %rax,$N[1]
  407. mov ($ap),%rax # ap[0]
  408. adc \$0,%rdx
  409. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  410. adc \$0,%rdx
  411. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  412. mov %rdx,$N[0]
  413. xor $N[1],$N[1]
  414. add $A[0],$N[0]
  415. adc \$0,$N[1]
  416. mov $N[0],-8(%rsp,$j,8)
  417. mov $N[1],(%rsp,$j,8) # store upmost overflow bit
  418. lea 1($i),$i # i++
  419. .align 4
  420. .Louter4x:
  421. mov ($bp,$i,8),$m0 # m0=bp[i]
  422. xor $j,$j # j=0
  423. mov (%rsp),$A[0]
  424. mov $n0,$m1
  425. mulq $m0 # ap[0]*bp[i]
  426. add %rax,$A[0] # ap[0]*bp[i]+tp[0]
  427. mov ($np),%rax
  428. adc \$0,%rdx
  429. imulq $A[0],$m1 # tp[0]*n0
  430. mov %rdx,$A[1]
  431. mulq $m1 # np[0]*m1
  432. add %rax,$A[0] # "$N[0]", discarded
  433. mov 8($ap),%rax
  434. adc \$0,%rdx
  435. mov %rdx,$N[1]
  436. mulq $m0 # ap[j]*bp[i]
  437. add %rax,$A[1]
  438. mov 8($np),%rax
  439. adc \$0,%rdx
  440. add 8(%rsp),$A[1] # +tp[1]
  441. adc \$0,%rdx
  442. mov %rdx,$A[0]
  443. mulq $m1 # np[j]*m1
  444. add %rax,$N[1]
  445. mov 16($ap),%rax
  446. adc \$0,%rdx
  447. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
  448. lea 4($j),$j # j+=4
  449. adc \$0,%rdx
  450. mov $N[1],(%rsp) # tp[j-1]
  451. mov %rdx,$N[0]
  452. jmp .Linner4x
  453. .align 16
  454. .Linner4x:
  455. mulq $m0 # ap[j]*bp[i]
  456. add %rax,$A[0]
  457. mov -16($np,$j,8),%rax
  458. adc \$0,%rdx
  459. add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  460. adc \$0,%rdx
  461. mov %rdx,$A[1]
  462. mulq $m1 # np[j]*m1
  463. add %rax,$N[0]
  464. mov -8($ap,$j,8),%rax
  465. adc \$0,%rdx
  466. add $A[0],$N[0]
  467. adc \$0,%rdx
  468. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  469. mov %rdx,$N[1]
  470. mulq $m0 # ap[j]*bp[i]
  471. add %rax,$A[1]
  472. mov -8($np,$j,8),%rax
  473. adc \$0,%rdx
  474. add -8(%rsp,$j,8),$A[1]
  475. adc \$0,%rdx
  476. mov %rdx,$A[0]
  477. mulq $m1 # np[j]*m1
  478. add %rax,$N[1]
  479. mov ($ap,$j,8),%rax
  480. adc \$0,%rdx
  481. add $A[1],$N[1]
  482. adc \$0,%rdx
  483. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  484. mov %rdx,$N[0]
  485. mulq $m0 # ap[j]*bp[i]
  486. add %rax,$A[0]
  487. mov ($np,$j,8),%rax
  488. adc \$0,%rdx
  489. add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  490. adc \$0,%rdx
  491. mov %rdx,$A[1]
  492. mulq $m1 # np[j]*m1
  493. add %rax,$N[0]
  494. mov 8($ap,$j,8),%rax
  495. adc \$0,%rdx
  496. add $A[0],$N[0]
  497. adc \$0,%rdx
  498. mov $N[0],-8(%rsp,$j,8) # tp[j-1]
  499. mov %rdx,$N[1]
  500. mulq $m0 # ap[j]*bp[i]
  501. add %rax,$A[1]
  502. mov 8($np,$j,8),%rax
  503. adc \$0,%rdx
  504. add 8(%rsp,$j,8),$A[1]
  505. adc \$0,%rdx
  506. lea 4($j),$j # j++
  507. mov %rdx,$A[0]
  508. mulq $m1 # np[j]*m1
  509. add %rax,$N[1]
  510. mov -16($ap,$j,8),%rax
  511. adc \$0,%rdx
  512. add $A[1],$N[1]
  513. adc \$0,%rdx
  514. mov $N[1],-32(%rsp,$j,8) # tp[j-1]
  515. mov %rdx,$N[0]
  516. cmp $num,$j
  517. jb .Linner4x
  518. mulq $m0 # ap[j]*bp[i]
  519. add %rax,$A[0]
  520. mov -16($np,$j,8),%rax
  521. adc \$0,%rdx
  522. add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
  523. adc \$0,%rdx
  524. mov %rdx,$A[1]
  525. mulq $m1 # np[j]*m1
  526. add %rax,$N[0]
  527. mov -8($ap,$j,8),%rax
  528. adc \$0,%rdx
  529. add $A[0],$N[0]
  530. adc \$0,%rdx
  531. mov $N[0],-24(%rsp,$j,8) # tp[j-1]
  532. mov %rdx,$N[1]
  533. mulq $m0 # ap[j]*bp[i]
  534. add %rax,$A[1]
  535. mov -8($np,$j,8),%rax
  536. adc \$0,%rdx
  537. add -8(%rsp,$j,8),$A[1]
  538. adc \$0,%rdx
  539. lea 1($i),$i # i++
  540. mov %rdx,$A[0]
  541. mulq $m1 # np[j]*m1
  542. add %rax,$N[1]
  543. mov ($ap),%rax # ap[0]
  544. adc \$0,%rdx
  545. add $A[1],$N[1]
  546. adc \$0,%rdx
  547. mov $N[1],-16(%rsp,$j,8) # tp[j-1]
  548. mov %rdx,$N[0]
  549. xor $N[1],$N[1]
  550. add $A[0],$N[0]
  551. adc \$0,$N[1]
  552. add (%rsp,$num,8),$N[0] # pull upmost overflow bit
  553. adc \$0,$N[1]
  554. mov $N[0],-8(%rsp,$j,8)
  555. mov $N[1],(%rsp,$j,8) # store upmost overflow bit
  556. cmp $num,$i
  557. jb .Louter4x
  558. ___
  559. {
  560. my @ri=("%rax","%rdx",$m0,$m1);
  561. $code.=<<___;
  562. mov 16(%rsp,$num,8),$rp # restore $rp
  563. mov 0(%rsp),@ri[0] # tp[0]
  564. mov 8(%rsp),@ri[1] # tp[1]
  565. shr \$2,$num # num/=4
  566. lea (%rsp),$ap # borrow ap for tp
  567. xor $i,$i # i=0 and clear CF!
  568. sub 0($np),@ri[0]
  569. mov 16($ap),@ri[2] # tp[2]
  570. mov 24($ap),@ri[3] # tp[3]
  571. sbb 8($np),@ri[1]
  572. lea -1($num),$j # j=num/4-1
  573. jmp .Lsub4x
  574. .align 16
  575. .Lsub4x:
  576. mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
  577. mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
  578. sbb 16($np,$i,8),@ri[2]
  579. mov 32($ap,$i,8),@ri[0] # tp[i+1]
  580. mov 40($ap,$i,8),@ri[1]
  581. sbb 24($np,$i,8),@ri[3]
  582. mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
  583. mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
  584. sbb 32($np,$i,8),@ri[0]
  585. mov 48($ap,$i,8),@ri[2]
  586. mov 56($ap,$i,8),@ri[3]
  587. sbb 40($np,$i,8),@ri[1]
  588. lea 4($i),$i # i++
  589. dec $j # doesn't affect CF!
  590. jnz .Lsub4x
  591. mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
  592. mov 32($ap,$i,8),@ri[0] # load overflow bit
  593. sbb 16($np,$i,8),@ri[2]
  594. mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
  595. sbb 24($np,$i,8),@ri[3]
  596. mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
  597. sbb \$0,@ri[0] # handle upmost overflow bit
  598. mov @ri[0],%xmm0
  599. punpcklqdq %xmm0,%xmm0 # extend mask to 128 bits
  600. mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
  601. xor $i,$i # i=0
  602. mov $num,$j
  603. pxor %xmm5,%xmm5
  604. jmp .Lcopy4x
  605. .align 16
  606. .Lcopy4x: # copy or in-place refresh
  607. movdqu (%rsp,$i),%xmm2
  608. movdqu 16(%rsp,$i),%xmm4
  609. movdqu ($rp,$i),%xmm1
  610. movdqu 16($rp,$i),%xmm3
  611. pxor %xmm1,%xmm2 # conditional select
  612. pxor %xmm3,%xmm4
  613. pand %xmm0,%xmm2
  614. pand %xmm0,%xmm4
  615. pxor %xmm1,%xmm2
  616. pxor %xmm3,%xmm4
  617. movdqu %xmm2,($rp,$i)
  618. movdqu %xmm4,16($rp,$i)
  619. movdqa %xmm5,(%rsp,$i) # zap temporary vectors
  620. movdqa %xmm5,16(%rsp,$i)
  621. lea 32($i),$i
  622. dec $j
  623. jnz .Lcopy4x
  624. shl \$2,$num
  625. ___
  626. }
  627. $code.=<<___;
  628. mov 8(%rsp,$num,8),%rsi # restore %rsp
  629. mov \$1,%rax
  630. mov (%rsi),%r15
  631. mov 8(%rsi),%r14
  632. mov 16(%rsi),%r13
  633. mov 24(%rsi),%r12
  634. mov 32(%rsi),%rbp
  635. mov 40(%rsi),%rbx
  636. lea 48(%rsi),%rsp
  637. .Lmul4x_epilogue:
  638. ret
  639. .size bn_mul4x_mont,.-bn_mul4x_mont
  640. ___
  641. }}}
  642. {{{
  643. ######################################################################
  644. # void bn_sqr8x_mont(
  645. my $rptr="%rdi"; # const BN_ULONG *rptr,
  646. my $aptr="%rsi"; # const BN_ULONG *aptr,
  647. my $bptr="%rdx"; # not used
  648. my $nptr="%rcx"; # const BN_ULONG *nptr,
  649. my $n0 ="%r8"; # const BN_ULONG *n0);
  650. my $num ="%r9"; # int num, has to be divisible by 8
  651. my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
  652. my @A0=("%r10","%r11");
  653. my @A1=("%r12","%r13");
  654. my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
  655. $code.=<<___ if ($addx);
  656. .extern bn_sqrx8x_internal # see x86_64-mont5 module
  657. ___
  658. $code.=<<___;
  659. .extern bn_sqr8x_internal # see x86_64-mont5 module
  660. .type bn_sqr8x_mont,\@function,6
  661. .align 32
  662. bn_sqr8x_mont:
  663. .Lsqr8x_enter:
  664. mov %rsp,%rax
  665. push %rbx
  666. push %rbp
  667. push %r12
  668. push %r13
  669. push %r14
  670. push %r15
  671. mov ${num}d,%r10d
  672. shl \$3,${num}d # convert $num to bytes
  673. shl \$3+2,%r10 # 4*$num
  674. neg $num
  675. ##############################################################
  676. # ensure that the stack frame doesn't alias with $aptr modulo
  677. # 4096. This is done to allow the memory disambiguation logic
  678. # to do its job.
  679. #
  680. lea -64(%rsp,$num,4),%r11
  681. mov ($n0),$n0 # *n0
  682. sub $aptr,%r11
  683. and \$4095,%r11
  684. cmp %r11,%r10
  685. jb .Lsqr8x_sp_alt
  686. sub %r11,%rsp # align with $aptr
  687. lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
  688. jmp .Lsqr8x_sp_done
  689. .align 32
  690. .Lsqr8x_sp_alt:
  691. lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num
  692. lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
  693. sub %r10,%r11
  694. mov \$0,%r10
  695. cmovc %r10,%r11
  696. sub %r11,%rsp
  697. .Lsqr8x_sp_done:
  698. and \$-64,%rsp
  699. mov $num,%r10
  700. neg $num
  701. lea 64(%rsp,$num,2),%r11 # copy of modulus
  702. mov $n0, 32(%rsp)
  703. mov %rax, 40(%rsp) # save original %rsp
  704. .Lsqr8x_body:
  705. mov $num,$i
  706. movq %r11, %xmm2 # save pointer to modulus copy
  707. shr \$3+2,$i
  708. mov OPENSSL_ia32cap_P+8(%rip),%eax
  709. jmp .Lsqr8x_copy_n
  710. .align 32
  711. .Lsqr8x_copy_n:
  712. movq 8*0($nptr),%xmm0
  713. movq 8*1($nptr),%xmm1
  714. movq 8*2($nptr),%xmm3
  715. movq 8*3($nptr),%xmm4
  716. lea 8*4($nptr),$nptr
  717. movdqa %xmm0,16*0(%r11)
  718. movdqa %xmm1,16*1(%r11)
  719. movdqa %xmm3,16*2(%r11)
  720. movdqa %xmm4,16*3(%r11)
  721. lea 16*4(%r11),%r11
  722. dec $i
  723. jnz .Lsqr8x_copy_n
  724. pxor %xmm0,%xmm0
  725. movq $rptr,%xmm1 # save $rptr
  726. movq %r10, %xmm3 # -$num
  727. ___
  728. $code.=<<___ if ($addx);
  729. and \$0x80100,%eax
  730. cmp \$0x80100,%eax
  731. jne .Lsqr8x_nox
  732. call bn_sqrx8x_internal # see x86_64-mont5 module
  733. pxor %xmm0,%xmm0
  734. lea 48(%rsp),%rax
  735. lea 64(%rsp,$num,2),%rdx
  736. shr \$3+2,$num
  737. mov 40(%rsp),%rsi # restore %rsp
  738. jmp .Lsqr8x_zero
  739. .align 32
  740. .Lsqr8x_nox:
  741. ___
  742. $code.=<<___;
  743. call bn_sqr8x_internal # see x86_64-mont5 module
  744. pxor %xmm0,%xmm0
  745. lea 48(%rsp),%rax
  746. lea 64(%rsp,$num,2),%rdx
  747. shr \$3+2,$num
  748. mov 40(%rsp),%rsi # restore %rsp
  749. jmp .Lsqr8x_zero
  750. .align 32
  751. .Lsqr8x_zero:
  752. movdqa %xmm0,16*0(%rax) # wipe t
  753. movdqa %xmm0,16*1(%rax)
  754. movdqa %xmm0,16*2(%rax)
  755. movdqa %xmm0,16*3(%rax)
  756. lea 16*4(%rax),%rax
  757. movdqa %xmm0,16*0(%rdx) # wipe n
  758. movdqa %xmm0,16*1(%rdx)
  759. movdqa %xmm0,16*2(%rdx)
  760. movdqa %xmm0,16*3(%rdx)
  761. lea 16*4(%rdx),%rdx
  762. dec $num
  763. jnz .Lsqr8x_zero
  764. mov \$1,%rax
  765. mov -48(%rsi),%r15
  766. mov -40(%rsi),%r14
  767. mov -32(%rsi),%r13
  768. mov -24(%rsi),%r12
  769. mov -16(%rsi),%rbp
  770. mov -8(%rsi),%rbx
  771. lea (%rsi),%rsp
  772. .Lsqr8x_epilogue:
  773. ret
  774. .size bn_sqr8x_mont,.-bn_sqr8x_mont
  775. ___
  776. }}}
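As the dispatch at the top of bn_mul_mont shows, this squaring routine is only reached when both operand pointers are equal and num is a non-zero multiple of 8; everything else falls back to the mul4x or the generic path. Expressed as a predicate (illustrative only):

sub would_use_sqr8x {                    # mirrors the tests before .Lmul_enter/.Lsqr8x_enter
    my ($ap, $bp, $num) = @_;            # $ap/$bp stand in for the operand pointers
    return $ap == $bp && $num >= 8 && $num % 8 == 0;
}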
  777. if ($addx) {{{
  778. my $bp="%rdx"; # original value
  779. $code.=<<___;
  780. .type bn_mulx4x_mont,\@function,6
  781. .align 32
  782. bn_mulx4x_mont:
  783. .Lmulx4x_enter:
  784. mov %rsp,%rax
  785. push %rbx
  786. push %rbp
  787. push %r12
  788. push %r13
  789. push %r14
  790. push %r15
  791. shl \$3,${num}d # convert $num to bytes
  792. .byte 0x67
  793. xor %r10,%r10
  794. sub $num,%r10 # -$num
  795. mov ($n0),$n0 # *n0
  796. lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
  797. lea ($bp,$num),%r10
  798. and \$-128,%rsp
  799. ##############################################################
  800. # Stack layout
  801. # +0 num
  802. # +8 off-loaded &b[i]
  803. # +16 end of b[num]
  804. # +24 saved n0
  805. # +32 saved rp
  806. # +40 saved %rsp
  807. # +48 inner counter
  808. # +56
  809. # +64 tmp[num+1]
  810. #
  811. mov $num,0(%rsp) # save $num
  812. shr \$5,$num
  813. mov %r10,16(%rsp) # end of b[num]
  814. sub \$1,$num
  815. mov $n0, 24(%rsp) # save *n0
  816. mov $rp, 32(%rsp) # save $rp
  817. mov %rax,40(%rsp) # save original %rsp
  818. mov $num,48(%rsp) # inner counter
  819. jmp .Lmulx4x_body
  820. .align 32
  821. .Lmulx4x_body:
  822. ___
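The frame offsets from the stack-layout comment above can be restated as named constants; a hedged summary only (the generated code keeps using the raw displacements):

use constant {
    FRAME_NUM  => 0,      # +0  saved $num, in bytes
    FRAME_BPTR => 8,      # +8  off-loaded &b[i]
    FRAME_BEND => 16,     # +16 end of b[num]
    FRAME_N0   => 24,     # +24 saved *n0
    FRAME_RP   => 32,     # +32 saved $rp
    FRAME_RSP  => 40,     # +40 saved original %rsp
    FRAME_CNT  => 48,     # +48 inner-loop counter
    FRAME_TMP  => 64,     # +64 tmp[num+1]
};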
  823. my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
  824. ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
  825. my $rptr=$bptr;
  826. $code.=<<___;
  827. lea 8($bp),$bptr
  828. mov ($bp),%rdx # b[0], $bp==%rdx actually
  829. lea 64+32(%rsp),$tptr
  830. mov %rdx,$bi
  831. mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
  832. mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
  833. add %rax,%r11
  834. mov $bptr,8(%rsp) # off-load &b[i]
  835. mulx 2*8($aptr),%r12,%r13 # ...
  836. adc %r14,%r12
  837. adc \$0,%r13
  838. mov $mi,$bptr # borrow $bptr
  839. imulq 24(%rsp),$mi # "t[0]"*n0
  840. xor $zero,$zero # cf=0, of=0
  841. mulx 3*8($aptr),%rax,%r14
  842. mov $mi,%rdx
  843. lea 4*8($aptr),$aptr
  844. adcx %rax,%r13
  845. adcx $zero,%r14 # cf=0
  846. mulx 0*8($nptr),%rax,%r10
  847. adcx %rax,$bptr # discarded
  848. adox %r11,%r10
  849. mulx 1*8($nptr),%rax,%r11
  850. adcx %rax,%r10
  851. adox %r12,%r11
  852. .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
  853. mov 48(%rsp),$bptr # counter value
  854. mov %r10,-4*8($tptr)
  855. adcx %rax,%r11
  856. adox %r13,%r12
  857. mulx 3*8($nptr),%rax,%r15
  858. mov $bi,%rdx
  859. mov %r11,-3*8($tptr)
  860. adcx %rax,%r12
  861. adox $zero,%r15 # of=0
  862. lea 4*8($nptr),$nptr
  863. mov %r12,-2*8($tptr)
  864. jmp .Lmulx4x_1st
  865. .align 32
  866. .Lmulx4x_1st:
  867. adcx $zero,%r15 # cf=0, modulo-scheduled
  868. mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
  869. adcx %r14,%r10
  870. mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
  871. adcx %rax,%r11
  872. mulx 2*8($aptr),%r12,%rax # ...
  873. adcx %r14,%r12
  874. mulx 3*8($aptr),%r13,%r14
  875. .byte 0x67,0x67
  876. mov $mi,%rdx
  877. adcx %rax,%r13
  878. adcx $zero,%r14 # cf=0
  879. lea 4*8($aptr),$aptr
  880. lea 4*8($tptr),$tptr
  881. adox %r15,%r10
  882. mulx 0*8($nptr),%rax,%r15
  883. adcx %rax,%r10
  884. adox %r15,%r11
  885. mulx 1*8($nptr),%rax,%r15
  886. adcx %rax,%r11
  887. adox %r15,%r12
  888. mulx 2*8($nptr),%rax,%r15
  889. mov %r10,-5*8($tptr)
  890. adcx %rax,%r12
  891. mov %r11,-4*8($tptr)
  892. adox %r15,%r13
  893. mulx 3*8($nptr),%rax,%r15
  894. mov $bi,%rdx
  895. mov %r12,-3*8($tptr)
  896. adcx %rax,%r13
  897. adox $zero,%r15
  898. lea 4*8($nptr),$nptr
  899. mov %r13,-2*8($tptr)
  900. dec $bptr # of=0, pass cf
  901. jnz .Lmulx4x_1st
  902. mov 0(%rsp),$num # load num
  903. mov 8(%rsp),$bptr # re-load &b[i]
  904. adc $zero,%r15 # modulo-scheduled
  905. add %r15,%r14
  906. sbb %r15,%r15 # top-most carry
  907. mov %r14,-1*8($tptr)
  908. jmp .Lmulx4x_outer
  909. .align 32
  910. .Lmulx4x_outer:
  911. mov ($bptr),%rdx # b[i]
  912. lea 8($bptr),$bptr # b++
  913. sub $num,$aptr # rewind $aptr
  914. mov %r15,($tptr) # save top-most carry
  915. lea 64+4*8(%rsp),$tptr
  916. sub $num,$nptr # rewind $nptr
  917. mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
  918. xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
  919. mov %rdx,$bi
  920. mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
  921. adox -4*8($tptr),$mi
  922. adcx %r14,%r11
  923. mulx 2*8($aptr),%r15,%r13 # ...
  924. adox -3*8($tptr),%r11
  925. adcx %r15,%r12
  926. adox $zero,%r12
  927. adcx $zero,%r13
  928. mov $bptr,8(%rsp) # off-load &b[i]
  929. .byte 0x67
  930. mov $mi,%r15
  931. imulq 24(%rsp),$mi # "t[0]"*n0
  932. xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
  933. mulx 3*8($aptr),%rax,%r14
  934. mov $mi,%rdx
  935. adox -2*8($tptr),%r12
  936. adcx %rax,%r13
  937. adox -1*8($tptr),%r13
  938. adcx $zero,%r14
  939. lea 4*8($aptr),$aptr
  940. adox $zero,%r14
  941. mulx 0*8($nptr),%rax,%r10
  942. adcx %rax,%r15 # discarded
  943. adox %r11,%r10
  944. mulx 1*8($nptr),%rax,%r11
  945. adcx %rax,%r10
  946. adox %r12,%r11
  947. mulx 2*8($nptr),%rax,%r12
  948. mov %r10,-4*8($tptr)
  949. adcx %rax,%r11
  950. adox %r13,%r12
  951. mulx 3*8($nptr),%rax,%r15
  952. mov $bi,%rdx
  953. mov %r11,-3*8($tptr)
  954. lea 4*8($nptr),$nptr
  955. adcx %rax,%r12
  956. adox $zero,%r15 # of=0
  957. mov 48(%rsp),$bptr # counter value
  958. mov %r12,-2*8($tptr)
  959. jmp .Lmulx4x_inner
  960. .align 32
  961. .Lmulx4x_inner:
  962. mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
  963. adcx $zero,%r15 # cf=0, modulo-scheduled
  964. adox %r14,%r10
  965. mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
  966. adcx 0*8($tptr),%r10
  967. adox %rax,%r11
  968. mulx 2*8($aptr),%r12,%rax # ...
  969. adcx 1*8($tptr),%r11
  970. adox %r14,%r12
  971. mulx 3*8($aptr),%r13,%r14
  972. mov $mi,%rdx
  973. adcx 2*8($tptr),%r12
  974. adox %rax,%r13
  975. adcx 3*8($tptr),%r13
  976. adox $zero,%r14 # of=0
  977. lea 4*8($aptr),$aptr
  978. lea 4*8($tptr),$tptr
  979. adcx $zero,%r14 # cf=0
  980. adox %r15,%r10
  981. mulx 0*8($nptr),%rax,%r15
  982. adcx %rax,%r10
  983. adox %r15,%r11
  984. mulx 1*8($nptr),%rax,%r15
  985. adcx %rax,%r11
  986. adox %r15,%r12
  987. mulx 2*8($nptr),%rax,%r15
  988. mov %r10,-5*8($tptr)
  989. adcx %rax,%r12
  990. adox %r15,%r13
  991. mulx 3*8($nptr),%rax,%r15
  992. mov $bi,%rdx
  993. mov %r11,-4*8($tptr)
  994. mov %r12,-3*8($tptr)
  995. adcx %rax,%r13
  996. adox $zero,%r15
  997. lea 4*8($nptr),$nptr
  998. mov %r13,-2*8($tptr)
  999. dec $bptr # of=0, pass cf
  1000. jnz .Lmulx4x_inner
  1001. mov 0(%rsp),$num # load num
  1002. mov 8(%rsp),$bptr # re-load &b[i]
  1003. adc $zero,%r15 # modulo-scheduled
  1004. sub 0*8($tptr),$zero # pull top-most carry
  1005. adc %r15,%r14
  1006. mov -8($nptr),$mi
  1007. sbb %r15,%r15 # top-most carry
  1008. mov %r14,-1*8($tptr)
  1009. cmp 16(%rsp),$bptr
  1010. jne .Lmulx4x_outer
  1011. sub %r14,$mi # compare top-most words
  1012. sbb $mi,$mi
  1013. or $mi,%r15
  1014. neg $num
  1015. xor %rdx,%rdx
  1016. mov 32(%rsp),$rptr # restore rp
  1017. lea 64(%rsp),$tptr
  1018. pxor %xmm0,%xmm0
  1019. mov 0*8($nptr,$num),%r8
  1020. mov 1*8($nptr,$num),%r9
  1021. neg %r8
  1022. jmp .Lmulx4x_sub_entry
  1023. .align 32
  1024. .Lmulx4x_sub:
  1025. mov 0*8($nptr,$num),%r8
  1026. mov 1*8($nptr,$num),%r9
  1027. not %r8
  1028. .Lmulx4x_sub_entry:
  1029. mov 2*8($nptr,$num),%r10
  1030. not %r9
  1031. and %r15,%r8
  1032. mov 3*8($nptr,$num),%r11
  1033. not %r10
  1034. and %r15,%r9
  1035. not %r11
  1036. and %r15,%r10
  1037. and %r15,%r11
  1038. neg %rdx # mov %rdx,%cf
  1039. adc 0*8($tptr),%r8
  1040. adc 1*8($tptr),%r9
  1041. movdqa %xmm0,($tptr)
  1042. adc 2*8($tptr),%r10
  1043. adc 3*8($tptr),%r11
  1044. movdqa %xmm0,16($tptr)
  1045. lea 4*8($tptr),$tptr
  1046. sbb %rdx,%rdx # mov %cf,%rdx
  1047. mov %r8,0*8($rptr)
  1048. mov %r9,1*8($rptr)
  1049. mov %r10,2*8($rptr)
  1050. mov %r11,3*8($rptr)
  1051. lea 4*8($rptr),$rptr
  1052. add \$32,$num
  1053. jnz .Lmulx4x_sub
  1054. mov 40(%rsp),%rsi # restore %rsp
  1055. mov \$1,%rax
  1056. mov -48(%rsi),%r15
  1057. mov -40(%rsi),%r14
  1058. mov -32(%rsi),%r13
  1059. mov -24(%rsi),%r12
  1060. mov -16(%rsi),%rbp
  1061. mov -8(%rsi),%rbx
  1062. lea (%rsi),%rsp
  1063. .Lmulx4x_epilogue:
  1064. ret
  1065. .size bn_mulx4x_mont,.-bn_mulx4x_mont
  1066. ___
  1067. }}}
  1068. $code.=<<___;
  1069. .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1070. .align 16
  1071. ___
  1072. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1073. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1074. if ($win64) {
  1075. $rec="%rcx";
  1076. $frame="%rdx";
  1077. $context="%r8";
  1078. $disp="%r9";
  1079. $code.=<<___;
  1080. .extern __imp_RtlVirtualUnwind
  1081. .type mul_handler,\@abi-omnipotent
  1082. .align 16
  1083. mul_handler:
  1084. push %rsi
  1085. push %rdi
  1086. push %rbx
  1087. push %rbp
  1088. push %r12
  1089. push %r13
  1090. push %r14
  1091. push %r15
  1092. pushfq
  1093. sub \$64,%rsp
  1094. mov 120($context),%rax # pull context->Rax
  1095. mov 248($context),%rbx # pull context->Rip
  1096. mov 8($disp),%rsi # disp->ImageBase
  1097. mov 56($disp),%r11 # disp->HandlerData
  1098. mov 0(%r11),%r10d # HandlerData[0]
  1099. lea (%rsi,%r10),%r10 # end of prologue label
  1100. cmp %r10,%rbx # context->Rip<end of prologue label
  1101. jb .Lcommon_seh_tail
  1102. mov 152($context),%rax # pull context->Rsp
  1103. mov 4(%r11),%r10d # HandlerData[1]
  1104. lea (%rsi,%r10),%r10 # epilogue label
  1105. cmp %r10,%rbx # context->Rip>=epilogue label
  1106. jae .Lcommon_seh_tail
  1107. mov 192($context),%r10 # pull $num
  1108. mov 8(%rax,%r10,8),%rax # pull saved stack pointer
  1109. lea 48(%rax),%rax
  1110. mov -8(%rax),%rbx
  1111. mov -16(%rax),%rbp
  1112. mov -24(%rax),%r12
  1113. mov -32(%rax),%r13
  1114. mov -40(%rax),%r14
  1115. mov -48(%rax),%r15
  1116. mov %rbx,144($context) # restore context->Rbx
  1117. mov %rbp,160($context) # restore context->Rbp
  1118. mov %r12,216($context) # restore context->R12
  1119. mov %r13,224($context) # restore context->R13
  1120. mov %r14,232($context) # restore context->R14
  1121. mov %r15,240($context) # restore context->R15
  1122. jmp .Lcommon_seh_tail
  1123. .size mul_handler,.-mul_handler
  1124. .type sqr_handler,\@abi-omnipotent
  1125. .align 16
  1126. sqr_handler:
  1127. push %rsi
  1128. push %rdi
  1129. push %rbx
  1130. push %rbp
  1131. push %r12
  1132. push %r13
  1133. push %r14
  1134. push %r15
  1135. pushfq
  1136. sub \$64,%rsp
  1137. mov 120($context),%rax # pull context->Rax
  1138. mov 248($context),%rbx # pull context->Rip
  1139. mov 8($disp),%rsi # disp->ImageBase
  1140. mov 56($disp),%r11 # disp->HandlerData
  1141. mov 0(%r11),%r10d # HandlerData[0]
  1142. lea (%rsi,%r10),%r10 # end of prologue label
  1143. cmp %r10,%rbx # context->Rip<.Lsqr_body
  1144. jb .Lcommon_seh_tail
  1145. mov 152($context),%rax # pull context->Rsp
  1146. mov 4(%r11),%r10d # HandlerData[1]
  1147. lea (%rsi,%r10),%r10 # epilogue label
  1148. cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
  1149. jae .Lcommon_seh_tail
  1150. mov 40(%rax),%rax # pull saved stack pointer
  1151. mov -8(%rax),%rbx
  1152. mov -16(%rax),%rbp
  1153. mov -24(%rax),%r12
  1154. mov -32(%rax),%r13
  1155. mov -40(%rax),%r14
  1156. mov -48(%rax),%r15
  1157. mov %rbx,144($context) # restore context->Rbx
  1158. mov %rbp,160($context) # restore context->Rbp
  1159. mov %r12,216($context) # restore context->R12
  1160. mov %r13,224($context) # restore context->R13
  1161. mov %r14,232($context) # restore context->R14
  1162. mov %r15,240($context) # restore context->R15
  1163. .Lcommon_seh_tail:
  1164. mov 8(%rax),%rdi
  1165. mov 16(%rax),%rsi
  1166. mov %rax,152($context) # restore context->Rsp
  1167. mov %rsi,168($context) # restore context->Rsi
  1168. mov %rdi,176($context) # restore context->Rdi
  1169. mov 40($disp),%rdi # disp->ContextRecord
  1170. mov $context,%rsi # context
  1171. mov \$154,%ecx # sizeof(CONTEXT)
  1172. .long 0xa548f3fc # cld; rep movsq
  1173. mov $disp,%rsi
  1174. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1175. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1176. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1177. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1178. mov 40(%rsi),%r10 # disp->ContextRecord
  1179. lea 56(%rsi),%r11 # &disp->HandlerData
  1180. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1181. mov %r10,32(%rsp) # arg5
  1182. mov %r11,40(%rsp) # arg6
  1183. mov %r12,48(%rsp) # arg7
  1184. mov %rcx,56(%rsp) # arg8, (NULL)
  1185. call *__imp_RtlVirtualUnwind(%rip)
  1186. mov \$1,%eax # ExceptionContinueSearch
  1187. add \$64,%rsp
  1188. popfq
  1189. pop %r15
  1190. pop %r14
  1191. pop %r13
  1192. pop %r12
  1193. pop %rbp
  1194. pop %rbx
  1195. pop %rdi
  1196. pop %rsi
  1197. ret
  1198. .size sqr_handler,.-sqr_handler
  1199. .section .pdata
  1200. .align 4
  1201. .rva .LSEH_begin_bn_mul_mont
  1202. .rva .LSEH_end_bn_mul_mont
  1203. .rva .LSEH_info_bn_mul_mont
  1204. .rva .LSEH_begin_bn_mul4x_mont
  1205. .rva .LSEH_end_bn_mul4x_mont
  1206. .rva .LSEH_info_bn_mul4x_mont
  1207. .rva .LSEH_begin_bn_sqr8x_mont
  1208. .rva .LSEH_end_bn_sqr8x_mont
  1209. .rva .LSEH_info_bn_sqr8x_mont
  1210. ___
  1211. $code.=<<___ if ($addx);
  1212. .rva .LSEH_begin_bn_mulx4x_mont
  1213. .rva .LSEH_end_bn_mulx4x_mont
  1214. .rva .LSEH_info_bn_mulx4x_mont
  1215. ___
  1216. $code.=<<___;
  1217. .section .xdata
  1218. .align 8
  1219. .LSEH_info_bn_mul_mont:
  1220. .byte 9,0,0,0
  1221. .rva mul_handler
  1222. .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
  1223. .LSEH_info_bn_mul4x_mont:
  1224. .byte 9,0,0,0
  1225. .rva mul_handler
  1226. .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
  1227. .LSEH_info_bn_sqr8x_mont:
  1228. .byte 9,0,0,0
  1229. .rva sqr_handler
  1230. .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
  1231. ___
  1232. $code.=<<___ if ($addx);
  1233. .LSEH_info_bn_mulx4x_mont:
  1234. .byte 9,0,0,0
  1235. .rva sqr_handler
  1236. .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
  1237. ___
  1238. }
  1239. print $code;
  1240. close STDOUT;