#if defined(__APPLE__)
/* OS X's C ABI prefixes functions with underscore. */
#define C_ABI(x) _ ## x
#define HIDDEN .private_extern
#else
#define C_ABI(x) x
#define HIDDEN .hidden
#endif
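# Field constants. The field is a 751-bit prime field; elements are kept
# in radix 2^52, i.e. 15 limbs of 52 bits, one limb per 64-bit zmm lane.
# .LpermMask0/.LshiftMask0 gather and align 52-bit limbs out of a packed
# radix-2^64 number (see norm2red below). .LandMask is 2^52 - 1. .Lpoly
# is the modulus p. .LR2 is R^2 mod p with R = 2^(52*15), used to enter
# the Montgomery domain, and .Lone is the constant 1, used to leave it.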
.p2align 6
.LpermMask0:
.word 0,1,2,3, 3,4,5,6, 6,7,8,9, 9,10,11,12, 13,14,15,16, 16,17,18,19, 19,20,21,22, 22,23,24,25
.LshiftMask0:
.quad 0,4,8,12,0,4,8,12
.LandMask:
.quad 0xfffffffffffff
.p2align 6
.Lpoly:
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff
.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480
.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0
.LR2:
.quad 0x000dad40589641fd, 0x000452a233046449, 0x000edb010161a696, 0x00036941472e3fd8
.quad 0x000e2082a2e7065e, 0x000904f8751f40bf, 0x0007fc814932cca8, 0x00033f174b08b2ee
.quad 0x0009814efb9f1375, 0x00099594a1afe512, 0x00043c75310de66d, 0x000197021a5b37b0
.quad 0x000cc1a272e73959, 0x000a733d7c97cd76, 0x0000000000292ee8, 0
.Lone:
.quad 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
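# norm2red: convert a packed radix-2^64 number (%rsi, 12 qwords / 96
# bytes) into 15 52-bit limbs (%rdi). For each 64-bit lane, vpermw
# gathers the 16-bit words covering bits [52*k, 52*k + 64), vpsrlvq
# aligns them (shift counts cycle 0,4,8,12), and vpandq masks to 52
# bits. k1 = 0x3FFFFF loads the 22 trailing input words; k2 = 0x7F
# stores the 7 trailing output limbs.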
.globl C_ABI(norm2red)
.p2align 6
C_ABI(norm2red):
mov $0x3FFFFF, %eax
kmovd %eax, %k1
mov $0x7F, %eax
kmovd %eax, %k2
vmovdqa64 .LpermMask0(%rip), %zmm0
vmovdqa64 .LshiftMask0(%rip), %zmm1
vpbroadcastq .LandMask(%rip), %zmm10
vpermw 52*0(%rsi), %zmm0, %zmm2
vmovdqu16 52*1(%rsi), %zmm3{%k1}{z}
vpermw %zmm3, %zmm0, %zmm3
vpsrlvq %zmm1, %zmm2, %zmm2
vpsrlvq %zmm1, %zmm3, %zmm3
vpandq %zmm10, %zmm2, %zmm2
vpandq %zmm10, %zmm3, %zmm3
vmovdqu64 %zmm2, 64*0(%rdi)
vmovdqu64 %zmm3, 64*1(%rdi){%k2}
ret
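# Register map for the IFMA Montgomery multiplier below. Operand A and
# the modulus are split: limb 0 lives in GPRs (a0, m0) and is handled
# with mulx/adc; limbs 1..14 live in two zmm registers each (A0/A1 and
# M0/M1).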
#define res %rdi // uint64_t *rp,
#define a0 %rsi // const uint64_t *ap,
#define bpi %rdx // const uint64_t *bptr,
#define m0 %rcx
#define b_ptr %rax
#define acc0 %r9
#define itr %r10
#define t0 %r11
#define t1 %r12
#define t2 %r13
#define A0 %zmm0
#define A1 %zmm1
#define M0 %zmm2
#define M1 %zmm3
#define ACC0 %zmm4
#define ACC0_xmm %xmm4
#define ACC1 %zmm5
#define Y_curr %zmm6
#define Y_prev %zmm7
#define B_curr %zmm8
#define B_prev %zmm9
#define TMP %zmm10
#define TMP_xmm %xmm10
#define ZERO %zmm11
#define AND_MASK %zmm12
#define ACC0b %zmm13
#define ACC1b %zmm14
###############################################################################
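# to_mont_ifma(res, a): Montgomery-multiply a by R^2 mod p,
# i.e. a * R^2 * R^(-1) = a*R mod p, moving a into the Montgomery domain.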
.globl C_ABI(to_mont_ifma)
.p2align 6
C_ABI(to_mont_ifma):
leaq .LR2(%rip), bpi
jmp C_ABI(fp_mul_ifma)
###############################################################################
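# from_mont_ifma(res, a): Montgomery-multiply a by 1,
# i.e. a * 1 * R^(-1) mod p, moving a back out of the Montgomery domain.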
.globl C_ABI(from_mont_ifma)
.p2align 6
C_ABI(from_mont_ifma):
leaq .Lone(%rip), bpi
jmp C_ABI(fp_mul_ifma)
###############################################################################
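# fp_mul_ifma(res, a, b): word-by-word Montgomery multiplication,
# res = a * b * R^(-1) mod p with R = 2^(52*15). Every low limb of p is
# 2^52 - 1 (so p == -1 mod 2^52), hence k0 = -p^(-1) mod 2^52 = 1 and
# the Montgomery quotient Y is simply the low 52 bits of the
# accumulator. Illustrative C sketch of the radix-2^52 loop (a, p and
# acc stand for 15-limb multiprecision values):
#
#   acc = 0;
#   for (i = 0; i < 15; i++) {
#       acc += a * b[i];                  // schoolbook row
#       y    = acc & ((1ULL << 52) - 1);  // Montgomery quotient, k0 == 1
#       acc  = (acc + y * p) >> 52;       // low 52 bits cancel; exact shift
#   }
#   // acc is the 15-limb, limb-normalized result.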
.globl C_ABI(fp_mul_ifma)
.p2align 6
C_ABI(fp_mul_ifma):
push %rbx
push %r12
push %r13
mov bpi, b_ptr
mov $1, t0
mov $0x3f, t1
kmovq t0, %k1
kmovq t1, %k2
vpbroadcastq .LandMask(%rip), AND_MASK
vpxorq ZERO, ZERO, ZERO
# Load operand A. A[0] is kept in a GPR to hide the IFMA latency when computing (A*B)[0] * K0.
vmovdqu64 8*1+64*0(a0), A0
vmovdqu64 8*1+64*1(a0), A1{%k2}{z}
mov 8*0(a0), a0
# Load the modulus
mov .Lpoly(%rip), m0
vmovdqu64 8*1+64*0+.Lpoly(%rip), M0
vmovdqu64 8*1+64*1+.Lpoly(%rip), M1{%k2}{z}
# Prepare the accumulators
vpxorq ACC0, ACC0, ACC0
vpxorq ACC1, ACC1, ACC1
vpxorq B_curr, B_curr, B_curr
vpxorq Y_curr, Y_curr, Y_curr
xor acc0, acc0
mov $15, itr
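# Main loop: 15 iterations, one 52-bit limb of b per iteration. The high
# halves (vpmadd52huq) of a product weigh 2^52 more than the low halves,
# so they are accumulated one iteration late: at the top of the loop
# B_curr/Y_curr still hold the previous iteration's multipliers (both
# start out zero), and the last iteration's high halves are applied
# after the loop.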
1:
vpxorq ACC0b, ACC0b, ACC0b
vpxorq ACC1b, ACC1b, ACC1b
# High multiplications
vpmadd52huq B_curr, A0, ACC0b
vpmadd52huq B_curr, A1, ACC1b
vpmadd52huq Y_curr, M0, ACC0b
vpmadd52huq Y_curr, M1, ACC1b
# Shift the ACC in zmms right by a word
valignq $1, ACC0, ACC1, ACC0
valignq $1, ACC1, ZERO, ACC1
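# Scalar limb-0 step: acc0 += a[0]*b[i]; Y = acc0 mod 2^52 (k0 == 1);
# acc0 = (acc0 + Y*p[0]) >> 52. Y and b[i] are also broadcast for the
# SIMD lanes. Roughly, in C (t being the 128-bit pair t2:acc0):
#   t = acc0 + (unsigned __int128)a0 * b[i];
#   y = (uint64_t)t & ((1ULL << 52) - 1);
#   t += (unsigned __int128)y * p[0];
#   acc0 = (uint64_t)(t >> 52);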
mov a0, %rdx
mulx (b_ptr), t0, t2
add t0, acc0
adc $0, t2
mov acc0, %rdx
and .LandMask(%rip), %rdx
vpbroadcastq %rdx, Y_curr
vpbroadcastq (b_ptr), B_curr
mulx m0, t0, t1
add t0, acc0
adc t1, t2
shrd $52, t2, acc0
# Low multiplications
vpmadd52luq B_curr, A0, ACC0b
vpmadd52luq B_curr, A1, ACC1b
vpmadd52luq Y_curr, M0, ACC0
vpmadd52luq Y_curr, M1, ACC1
vpaddq ACC0b, ACC0, ACC0
vpaddq ACC1b, ACC1, ACC1
vmovq ACC0_xmm, t0
add t0, acc0
lea 8(b_ptr), b_ptr
dec itr
jne 1b
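# Loop epilogue: fold the scalar limb into lane 0 of ACC0 (k1 == 1),
# shift A and M up one lane (the deferred high halves weigh one limb
# more), and apply the last iteration's high-half products.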
vmovq acc0, TMP_xmm
vmovdqa64 TMP, ACC0{%k1}
valignq $7, A0, A1, A1
valignq $7, ZERO, A0, A0
valignq $7, M0, M1, M1
valignq $7, ZERO, M0, M0
# The last high multiplications
vpmadd52huq B_curr, A0, ACC0
vpmadd52huq B_curr, A1, ACC1
vpmadd52huq Y_curr, M0, ACC0
vpmadd52huq Y_curr, M1, ACC1
# Now 'normalize' the result to 52-bit words
vpsrlq $52, ACC0, A0
vpsrlq $52, ACC1, A1
vpandq AND_MASK, ACC0, ACC0
vpandq AND_MASK, ACC1, ACC1
valignq $7, A0, A1, A1
valignq $7, ZERO, A0, A0
vpaddq A0, ACC0, ACC0
vpaddq A1, ACC1, ACC1
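# Propagate the remaining lane carries in GPRs: the first compare pair
# flags lanes with a carry out of 52 bits, the second flags lanes equal
# to 2^52 - 1 (which forward an incoming carry). add/adc ripples the
# carry across all 15 lanes at once, the xor recovers the lanes that
# wrap to zero, and the masked vpsubq of 2^52 - 1 adds 1 mod 2^52 to
# each flagged lane (the final vpandq drops the carry bits).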
vpcmpuq $1, A0, ACC0, %k1
vpcmpuq $1, A1, ACC1, %k2
kmovb %k1, %eax
kmovb %k2, %ebx
add %al, %al
adc %bl, %bl
vpcmpuq $0, AND_MASK, ACC0, %k1
vpcmpuq $0, AND_MASK, ACC1, %k2
kmovb %k1, %r8d
kmovb %k2, %r9d
add %r8b, %al
adc %r9b, %bl
xor %r8b, %al
xor %r9b, %bl
kmovb %eax, %k1
kmovb %ebx, %k2
vpsubq AND_MASK, ACC0, ACC0{%k1}
vpsubq AND_MASK, ACC1, ACC1{%k2}
vpandq AND_MASK, ACC0, ACC0
vpandq AND_MASK, ACC1, ACC1
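# Store the 15-limb result: 8 limbs, then the 7 remaining under mask 0x7f.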
mov $0x7f, t0
kmovq t0, %k1
vmovdqu64 ACC0, 64*0(res)
vmovdqu64 ACC1, 64*1(res){%k1}
bail:
pop %r13
pop %r12
pop %rbx
ret