You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

193 lines
6.9 KiB

  1. // +build amd64,!noasm
  2. #include "textflag.h"
  // mul512 multiplies a 512-bit value by a 64-bit value and keeps only the
  // low 512 bits of the product:
  //
  //     a = (b * c) mod 2^512
  //
  // Both a and b are accessed as eight 64-bit words with the least
  // significant word at offset 0. When ·hasBMI2 equals 1 the MULX based
  // path is taken, otherwise the product is computed with MULQ.
  //
  // Registers used: AX, DX, SI, DI, R10, R11
  //
  // func mul512(a, b *Fp, c uint64)
  TEXT ·mul512(SB), NOSPLIT, $0-24
      MOVQ a+0(FP), DI                    // DI = result
      MOVQ b+8(FP), SI                    // SI = multiplicand

      // Check whether to use the optimized implementation.
      CMPB ·hasBMI2(SB), $1
      JE   mul512_mulx

      // Generic path. One word of the result per step: MULQ leaves the
      // 128-bit partial product in DX:AX, the high word kept in R11 from the
      // previous step is added in, and the new high word moves to R11.
  #define MUL512_MULQ_WORD(off) \
      MOVQ R10, AX              \
      MULQ (off)(SI)            \
      ADDQ R11, AX              \
      ADCQ $0, DX               \
      MOVQ DX, R11              \
      MOVQ AX, (off)(DI)

      MOVQ c+16(FP), R10                  // 64-bit multiplier, reloaded into AX for every MULQ

      // x[0]: nothing to add in yet
      MOVQ R10, AX
      MULQ 0(SI)
      MOVQ DX, R11
      MOVQ AX, 0(DI)

      MUL512_MULQ_WORD(8)                 // x[1]
      MUL512_MULQ_WORD(16)                // x[2]
      MUL512_MULQ_WORD(24)                // x[3]
      MUL512_MULQ_WORD(32)                // x[4]
      MUL512_MULQ_WORD(40)                // x[5]
      MUL512_MULQ_WORD(48)                // x[6]

      // x[7]: the high word of this product lies above bit 511 and is dropped
      MOVQ R10, AX
      MULQ 56(SI)
      ADDQ R11, AX
      MOVQ AX, 56(DI)
      RET
  #undef MUL512_MULQ_WORD

      // Optimized for CPUs with BMI2. MULXQ takes the multiplier from DX and
      // writes the low word to AX and the high word to the last operand.
      // The carry started by ADDQ at x[1] is continued by each ADCQ; the
      // MULXQ and MOVQ in between are relied upon not to modify flags.
  mul512_mulx:
  #define MUL512_MULX_WORD(off, carryIn, carryOut) \
      MULXQ (off)(SI), AX, carryOut                \
      ADCQ  carryIn, AX                            \
      MOVQ  AX, (off)(DI)

      MOVQ c+16(FP), DX                   // 64-bit multiplier, used by MULX

      // x[0]
      MULXQ 0(SI), AX, R10
      MOVQ  AX, 0(DI)

      // x[1]: first addition of a high word, starts the carry chain
      MULXQ 8(SI), AX, R11
      ADDQ  R10, AX
      MOVQ  AX, 8(DI)

      MUL512_MULX_WORD(16, R11, R10)      // x[2]
      MUL512_MULX_WORD(24, R10, R11)      // x[3]
      MUL512_MULX_WORD(32, R11, R10)      // x[4]
      MUL512_MULX_WORD(40, R10, R11)      // x[5]
      MUL512_MULX_WORD(48, R11, R10)      // x[6]
      MUL512_MULX_WORD(56, R10, R11)      // x[7], high word in R11 is unused
      RET
  #undef MUL512_MULX_WORD
  // mul576 multiplies a 512-bit value by a 64-bit value and stores the full
  // 576-bit product. Uses the MULQ instruction to multiply two 64-bit values.
  //
  //     a = b * c
  //
  // b is read as eight 64-bit words, least significant word at offset 0.
  // Nine 64-bit words are written at a (offsets 0 through 64), so a must
  // point at no less than 72 bytes.
  // NOTE(review): the signature below is kept from the original comment;
  // confirm that the Go type behind a really provides 72 bytes.
  //
  // Registers used: AX, DX, SI, DI, R10, R11
  //
  // func mul576(a, b *Fp, c uint64)
  TEXT ·mul576(SB), NOSPLIT, $0-24
      MOVQ a+0(FP), DI                    // DI = result
      MOVQ b+8(FP), SI                    // SI = multiplicand
      MOVQ c+16(FP), R10                  // 64-bit multiplier, reloaded into AX for every MULQ

      // One word of the result per step: MULQ leaves the 128-bit partial
      // product in DX:AX, the high word kept in R11 from the previous step is
      // added in, and the new high word moves to R11.
  #define MUL576_MULQ_WORD(off) \
      MOVQ R10, AX              \
      MULQ (off)(SI)            \
      ADDQ R11, AX              \
      ADCQ $0, DX               \
      MOVQ DX, R11              \
      MOVQ AX, (off)(DI)

      // x[0]: nothing to add in yet
      MOVQ R10, AX
      MULQ 0(SI)
      MOVQ DX, R11
      MOVQ AX, 0(DI)

      MUL576_MULQ_WORD(8)                 // x[1]
      MUL576_MULQ_WORD(16)                // x[2]
      MUL576_MULQ_WORD(24)                // x[3]
      MUL576_MULQ_WORD(32)                // x[4]
      MUL576_MULQ_WORD(40)                // x[5]
      MUL576_MULQ_WORD(48)                // x[6]

      // x[7] and x[8]: the last high word is stored as the ninth word
      MOVQ R10, AX
      MULQ 56(SI)
      ADDQ R11, AX
      ADCQ $0, DX
      MOVQ AX, 56(DI)
      MOVQ DX, 64(DI)
      RET
  #undef MUL576_MULQ_WORD
  // cswap512 conditionally exchanges the 64 bytes at x with the 64 bytes at
  // y. The frame holds two pointers (x, y) and a single byte (choice).
  //
  // choice is expected to be 0 or 1. It is turned into a mask that is all
  // zeros or all ones, and each 16-byte block is updated as
  //
  //     t = mask & (x ^ y);  x ^= t;  y ^= t
  //
  // so the same instructions run and both buffers are written back whether
  // or not a swap takes place.
  //
  // Registers used: AX, SI, DI, X0, X1, X2, X15
  TEXT ·cswap512(SB),NOSPLIT,$0-17
      MOVQ    x+0(FP), DI
      MOVQ    y+8(FP), SI
      MOVBLZX choice+16(FP), AX           // AX = choice, zero-extended

      // 0 stays 0; 1 becomes a value with every bit set.
      NEGQ    AX

      // Place AX in the low 64 bits of X15, then replicate the lowest
      // 32 bits into all four dwords so that X15 is all zeros or all ones.
      MOVQ    AX, X15
      PSHUFD  $0, X15, X15

      // Swap one 16-byte block under the mask in X15. The definition is
      // skipped if CSWAP_BLOCK already exists.
  #ifndef CSWAP_BLOCK
  #define CSWAP_BLOCK(blk)        \
      MOVOU (blk*16)(DI), X0      \
      MOVOU (blk*16)(SI), X1      \
      \ // X2 = mask & (X0 ^ X1)
      MOVO  X1, X2                \
      PXOR  X0, X2                \
      PAND  X15, X2               \
      \ // fold the masked difference into both values
      PXOR  X2, X0                \
      PXOR  X2, X1                \
      \ // write both blocks back
      MOVOU X0, (blk*16)(DI)      \
      MOVOU X1, (blk*16)(SI)
  #endif

      // 4 blocks * 16 bytes = 64 bytes
      CSWAP_BLOCK(0)
      CSWAP_BLOCK(1)
      CSWAP_BLOCK(2)
      CSWAP_BLOCK(3)
      RET
  // mulBmiAsm implements Montgomery multiplication interleaved with
  // Montgomery reduction. It uses the MULX and ADCX/ADOX instructions.
  // The implementation is specific to the 511-bit prime 'p'.
  //
  // x and y are read as eight 64-bit words each. Nine registers
  // (R8-R15 and BP) act as a rotating accumulator: every round handles one
  // word of x and shifts the register roles by one position. After the
  // eighth round the eight result words are stored at res.
  //
  // The value stored at res is NOT conditionally reduced here; per the
  // original note it still has to be reduced if it is greater than p.
  //
  // The 8-byte local frame is used only to preserve BP.
  //
  // Registers used: AX, BX, CX, DX, SI, DI, BP, R8-R15
  //
  // func mulBmiAsm(res, x, y *fp)
  TEXT ·mulBmiAsm(SB),NOSPLIT,$8-24
      MOVQ x+8(FP), DI                    // DI = multiplicand
      MOVQ y+16(FP), SI                   // SI = multiplier

      // BP serves as the ninth accumulator word, so stash it first.
      MOVQ BP, 0(SP)

      // Clear the accumulator.
      XORQ R8, R8
      XORQ R9, R9
      XORQ R10, R10
      XORQ R11, R11
      XORQ R12, R12
      XORQ R13, R13
      XORQ R14, R14
      XORQ R15, R15
      XORQ BP, BP

      // One round, uses BMI2 (MULX) and ADX (ADCX/ADOX).
      //   limb    index of the word of x handled by this round
      //   t0..t8  accumulator registers, t0 least significant
      // In both halves XORQ AX, AX clears CF and OF, then ADOXQ adds the low
      // words of the products (carry kept in OF) while ADCXQ adds the high
      // words one position up (carry kept in CF). The closing MOVQ $0, AX
      // is presumably chosen over XORQ so that both pending carries survive
      // until they are folded into t8.
  #ifdef MULS_MULX_512
  #undef MULS_MULX_512
  #endif
  #define MULS_MULX_512(limb, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
      \ // Reduction step: DX = (t0 + low64(x[limb] * y[0])) * pNegInv mod 2^64
      MOVQ   (0)(SI), DX              \
      MULXQ  (8*limb)(DI), DX, CX     \
      ADDQ   t0, DX                   \
      MULXQ  ·pNegInv(SB), DX, CX     \
      \ // t += DX * p
      XORQ   AX, AX                   \
      MULXQ  ·p+0(SB), AX, BX         \
      ADOXQ  AX, t0                   \
      MULXQ  ·p+8(SB), AX, CX         \
      ADCXQ  BX, t1                   \
      ADOXQ  AX, t1                   \
      MULXQ  ·p+16(SB), AX, BX        \
      ADCXQ  CX, t2                   \
      ADOXQ  AX, t2                   \
      MULXQ  ·p+24(SB), AX, CX        \
      ADCXQ  BX, t3                   \
      ADOXQ  AX, t3                   \
      MULXQ  ·p+32(SB), AX, BX        \
      ADCXQ  CX, t4                   \
      ADOXQ  AX, t4                   \
      MULXQ  ·p+40(SB), AX, CX        \
      ADCXQ  BX, t5                   \
      ADOXQ  AX, t5                   \
      MULXQ  ·p+48(SB), AX, BX        \
      ADCXQ  CX, t6                   \
      ADOXQ  AX, t6                   \
      MULXQ  ·p+56(SB), AX, CX        \
      ADCXQ  BX, t7                   \
      ADOXQ  AX, t7                   \
      MOVQ   $0, AX                   \
      ADCXQ  CX, t8                   \
      ADOXQ  AX, t8                   \
      \ // Multiplication step: t += x[limb] * y
      MOVQ   (8*limb)(DI), DX         \
      XORQ   AX, AX                   \
      MULXQ  (0)(SI), AX, BX          \
      ADOXQ  AX, t0                   \
      MULXQ  (8)(SI), AX, CX          \
      ADCXQ  BX, t1                   \
      ADOXQ  AX, t1                   \
      MULXQ  (16)(SI), AX, BX         \
      ADCXQ  CX, t2                   \
      ADOXQ  AX, t2                   \
      MULXQ  (24)(SI), AX, CX         \
      ADCXQ  BX, t3                   \
      ADOXQ  AX, t3                   \
      MULXQ  (32)(SI), AX, BX         \
      ADCXQ  CX, t4                   \
      ADOXQ  AX, t4                   \
      MULXQ  (40)(SI), AX, CX         \
      ADCXQ  BX, t5                   \
      ADOXQ  AX, t5                   \
      MULXQ  (48)(SI), AX, BX         \
      ADCXQ  CX, t6                   \
      ADOXQ  AX, t6                   \
      MULXQ  (56)(SI), AX, CX         \
      ADCXQ  BX, t7                   \
      ADOXQ  AX, t7                   \
      MOVQ   $0, AX                   \
      ADCXQ  CX, t8                   \
      ADOXQ  AX, t8

      // Eight rounds; the register list rotates left by one each time.
      MULS_MULX_512(0, R8, R9, R10, R11, R12, R13, R14, R15, BP)
      MULS_MULX_512(1, R9, R10, R11, R12, R13, R14, R15, BP, R8)
      MULS_MULX_512(2, R10, R11, R12, R13, R14, R15, BP, R8, R9)
      MULS_MULX_512(3, R11, R12, R13, R14, R15, BP, R8, R9, R10)
      MULS_MULX_512(4, R12, R13, R14, R15, BP, R8, R9, R10, R11)
      MULS_MULX_512(5, R13, R14, R15, BP, R8, R9, R10, R11, R12)
      MULS_MULX_512(6, R14, R15, BP, R8, R9, R10, R11, R12, R13)
      MULS_MULX_512(7, R15, BP, R8, R9, R10, R11, R12, R13, R14)
  #undef MULS_MULX_512

      // Store the result words: t1..t8 of the last round, i.e. BP, R8..R14.
      MOVQ res+0(FP), DI
      MOVQ BP, 0(DI)
      MOVQ R8, 8(DI)
      MOVQ R9, 16(DI)
      MOVQ R10, 24(DI)
      MOVQ R11, 32(DI)
      MOVQ R12, 40(DI)
      MOVQ R13, 48(DI)
      MOVQ R14, 56(DI)

      // Restore the caller's BP only after its accumulator value is stored.
      MOVQ 0(SP), BP

      // NOW the value at DI needs to be reduced if > p
      RET