-
- #if defined(__APPLE__)
- /* The macOS C ABI prefixes symbol names with an underscore. */
- #define C_ABI(x) _ ## x
- #define HIDDEN .private_extern
- #else
- #define C_ABI(x) x
- #define HIDDEN .hidden
- #endif
-
- .p2align 6
- .LpermMask0:
- .word 0,1,2,3, 3,4,5,6, 6,7,8,9, 9,10,11,12, 13,14,15,16, 16,17,18,19, 19,20,21,22, 22,23,24,25
- .LshiftMask0:
- .quad 0,4,8,12,0,4,8,12
- .LandMask:
- .quad 0xfffffffffffff
-
- .p2align 6
- .Lpoly:
- .quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff
- .quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff
- .quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480
- .quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0
-
- .LpolyX:
- .quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00
- .quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x049f878a8eeaff00
- .quad 0x07cc76e3ec968500, 0x076da959b1a13f00, 0x084e9867d6ebe800, 0x0b5045cb25748000
- .quad 0x0f97badc66856200, 0x041f71c0e1290900, 0x000000006fe5d500, 0
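-
- # Field elements are stored in radix 2^52: fifteen 52-bit digits held in a
- # zmm pair (8 digits + 7 digits, hence the 0x7f lane masks), 15*8 bytes per
- # element; an Fp2 element is two consecutive Fp elements. .LandMask is
- # 2^52-1, .Lpoly holds the digits of the modulus p, and .LpolyX holds the
- # same digits shifted left by 8 bits (a 2^8 multiple of p in redundant
- # form), added before subtractions to keep every digit non-negative.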
-
- #define felemR %rdi
- #define felemA %rsi
- #define felemB %rdx
-
- #define itr %r10
-
- #define M0 %zmm0
- #define M1 %zmm1
- #define ZERO %zmm2
- #define AND_MASK %zmm3
-
- #define A0a %zmm4
- #define A0b %zmm5
-
- #define A1a %zmm6
- #define A1b %zmm7
-
- #define ACC0a %zmm8
- #define ACC0b %zmm9
- #define ACC1a %zmm10
- #define ACC1b %zmm11
- #define ACC2a %zmm12
- #define ACC2b %zmm13
- #define ACC3a %zmm14
- #define ACC3b %zmm15
-
- #define B0curr %zmm16
- #define B0prev %zmm17
- #define B1curr %zmm18
- #define B1prev %zmm19
-
- #define Y0curr %zmm20
- #define Y0prev %zmm21
- #define Y1curr %zmm22
- #define Y1prev %zmm23
- #define Y2curr %zmm24
- #define Y2prev %zmm25
- #define Y3curr %zmm26
- #define Y3prev %zmm27
-
- #define T0 %zmm28
- #define T1 %zmm29
- #define T2 %zmm30
- #define T3 %zmm31
-
- ###############################################################################
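- # fp2_mul_ifma(felemR, felemA, felemB):
- #   R = A * B in Fp2 with A = (A0, A1), B = (B0, B1),
- #   C0 = A0*B0 - A1*B1 and C1 = A0*B1 + A1*B0.
- # One 52-bit digit of B is consumed per iteration with a word-by-word
- # Montgomery-style reduction interleaved: since the lowest digit of the
- # modulus is 2^52-1, -p^-1 mod 2^52 = 1, so the broadcast low digit of each
- # accumulator (the Y values) serves directly as the reduction multiplier.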
- .globl C_ABI(fp2_mul_ifma)
- .p2align 6
- C_ABI(fp2_mul_ifma):
-
- mov $1, %eax
- kmovw %eax, %k1
- mov $0x7f, %eax
- kmovw %eax, %k5
-
- vpbroadcastq .LandMask(%rip), AND_MASK
- vpxorq ZERO, ZERO, ZERO
-
- vmovdqu64 64*0(felemA), A0a
- vmovdqu64 64*1(felemA), A0b{%k5}{z}
- vmovdqu64 15*8 + 64*0(felemA), A1a
- vmovdqu64 15*8 + 64*1(felemA), A1b{%k5}{z}
-
- # Load the modulus
- vmovdqa64 64*0 + .Lpoly(%rip), M0
- vmovdqa64 64*1 + .Lpoly(%rip), M1
-
- # Prepare the accumulators
- vpxorq ACC0a, ACC0a, ACC0a
- vpxorq ACC0b, ACC0b, ACC0b
- vpxorq ACC1a, ACC1a, ACC1a
- vpxorq ACC1b, ACC1b, ACC1b
- vpxorq ACC2a, ACC2a, ACC2a
- vpxorq ACC2b, ACC2b, ACC2b
- vpxorq ACC3a, ACC3a, ACC3a
- vpxorq ACC3b, ACC3b, ACC3b
- vpxorq T0, T0, T0
- vpxorq T1, T1, T1
- vpxorq T2, T2, T2
- vpxorq T3, T3, T3
-
- # First iteration
- vpbroadcastq (felemB), B0curr
- vpbroadcastq 15*8(felemB), B1curr
- lea 8(felemB), felemB
-
- vpmadd52luq B0curr, A0a, ACC0a
- vpmadd52luq B0curr, A0b, ACC0b
- vpmadd52luq B1curr, A1a, ACC1a
- vpmadd52luq B1curr, A1b, ACC1b
- vpmadd52luq B0curr, A1a, ACC2a
- vpmadd52luq B0curr, A1b, ACC2b
- vpmadd52luq B1curr, A0a, ACC3a
- vpmadd52luq B1curr, A0b, ACC3b
-
- vpermq ACC0a, ZERO, Y0curr
- vpermq ACC1a, ZERO, Y1curr
- vpermq ACC2a, ZERO, Y2curr
- vpermq ACC3a, ZERO, Y3curr
-
- vpmadd52luq Y0curr, M0, ACC0a
- vpmadd52luq Y0curr, M1, ACC0b
- vpmadd52luq Y1curr, M0, ACC1a
- vpmadd52luq Y1curr, M1, ACC1b
- vpmadd52luq Y2curr, M0, ACC2a
- vpmadd52luq Y2curr, M1, ACC2b
- vpmadd52luq Y3curr, M0, ACC3a
- vpmadd52luq Y3curr, M1, ACC3b
-
- vpsrlq $52, ACC0a, T0{%k1}{z}
- vpsrlq $52, ACC1a, T1{%k1}{z}
- vpsrlq $52, ACC2a, T2{%k1}{z}
- vpsrlq $52, ACC3a, T3{%k1}{z}
-
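- # B has 15 digits in total: one was consumed above, 14 more in the loop,
- # and the final high-half products are added after the loop.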
- mov $14, itr
-
- 1:
- # Shift the ACC in zmms right by a word
- valignq $1, ACC0a, ACC0b, ACC0a
- valignq $1, ACC0b, ZERO, ACC0b
- valignq $1, ACC1a, ACC1b, ACC1a
- valignq $1, ACC1b, ZERO, ACC1b
- valignq $1, ACC2a, ACC2b, ACC2a
- valignq $1, ACC2b, ZERO, ACC2b
- valignq $1, ACC3a, ACC3b, ACC3a
- valignq $1, ACC3b, ZERO, ACC3b
-
- vmovdqa64 B0curr, B0prev
- vmovdqa64 B1curr, B1prev
- vmovdqa64 Y0curr, Y0prev
- vmovdqa64 Y1curr, Y1prev
- vmovdqa64 Y2curr, Y2prev
- vmovdqa64 Y3curr, Y3prev
-
- vpbroadcastq (felemB), B0curr
- vpbroadcastq 15*8(felemB), B1curr
- lea 8(felemB), felemB
-
- # High multiplications
- vpmadd52huq B0prev, A0a, ACC0a # ACC0 = A0 * B0
- vpmadd52huq B0prev, A0b, ACC0b
- vpmadd52huq B1prev, A1a, ACC1a # ACC1 = A1 * B1
- vpmadd52huq B1prev, A1b, ACC1b
- vpmadd52huq B0prev, A1a, ACC2a # ACC2 = A1 * B0
- vpmadd52huq B0prev, A1b, ACC2b
- vpmadd52huq B1prev, A0a, ACC3a # ACC3 = A0 * B1
- vpmadd52huq B1prev, A0b, ACC3b
-
- vpmadd52huq Y0prev, M0, ACC0a
- vpmadd52huq Y0prev, M1, ACC0b
- vpmadd52huq Y1prev, M0, ACC1a
- vpmadd52huq Y1prev, M1, ACC1b
- vpmadd52huq Y2prev, M0, ACC2a
- vpmadd52huq Y2prev, M1, ACC2b
- vpmadd52huq Y3prev, M0, ACC3a
- vpmadd52huq Y3prev, M1, ACC3b
- # Low multiplications
- vpmadd52luq B0curr, A0a, ACC0a
- vpmadd52luq B0curr, A0b, ACC0b
- vpmadd52luq B1curr, A1a, ACC1a
- vpmadd52luq B1curr, A1b, ACC1b
- vpmadd52luq B0curr, A1a, ACC2a
- vpmadd52luq B0curr, A1b, ACC2b
- vpmadd52luq B1curr, A0a, ACC3a
- vpmadd52luq B1curr, A0b, ACC3b
-
- vpaddq T0, ACC0a, ACC0a
- vpaddq T1, ACC1a, ACC1a
- vpaddq T2, ACC2a, ACC2a
- vpaddq T3, ACC3a, ACC3a
- vpermq ACC0a, ZERO, Y0curr
- vpermq ACC1a, ZERO, Y1curr
- vpermq ACC2a, ZERO, Y2curr
- vpermq ACC3a, ZERO, Y3curr
-
- vpmadd52luq Y0curr, M0, ACC0a
- vpmadd52luq Y0curr, M1, ACC0b
- vpmadd52luq Y1curr, M0, ACC1a
- vpmadd52luq Y1curr, M1, ACC1b
- vpmadd52luq Y2curr, M0, ACC2a
- vpmadd52luq Y2curr, M1, ACC2b
- vpmadd52luq Y3curr, M0, ACC3a
- vpmadd52luq Y3curr, M1, ACC3b
-
- vpsrlq $52, ACC0a, T0{%k1}{z}
- vpsrlq $52, ACC1a, T1{%k1}{z}
- vpsrlq $52, ACC2a, T2{%k1}{z}
- vpsrlq $52, ACC3a, T3{%k1}{z}
-
- dec itr
- jne 1b
- valignq $1, ACC0a, ACC0b, ACC0a
- valignq $1, ACC0b, ZERO, ACC0b
- valignq $1, ACC1a, ACC1b, ACC1a
- valignq $1, ACC1b, ZERO, ACC1b
- valignq $1, ACC2a, ACC2b, ACC2a
- valignq $1, ACC2b, ZERO, ACC2b
- valignq $1, ACC3a, ACC3b, ACC3a
- valignq $1, ACC3b, ZERO, ACC3b
- vpaddq T0, ACC0a, ACC0a
- vpaddq T1, ACC1a, ACC1a
- vpaddq T2, ACC2a, ACC2a
- vpaddq T3, ACC3a, ACC3a
-
- # The last high multiplications
- vpmadd52huq B0curr, A0a, ACC0a
- vpmadd52huq B0curr, A0b, ACC0b
- vpmadd52huq B1curr, A1a, ACC1a
- vpmadd52huq B1curr, A1b, ACC1b
- vpmadd52huq B0curr, A1a, ACC2a
- vpmadd52huq B0curr, A1b, ACC2b
- vpmadd52huq B1curr, A0a, ACC3a
- vpmadd52huq B1curr, A0b, ACC3b
-
- vpmadd52huq Y0curr, M0, ACC0a
- vpmadd52huq Y0curr, M1, ACC0b
- vpmadd52huq Y1curr, M0, ACC1a
- vpmadd52huq Y1curr, M1, ACC1b
- vpmadd52huq Y2curr, M0, ACC2a
- vpmadd52huq Y2curr, M1, ACC2b
- vpmadd52huq Y3curr, M0, ACC3a
- vpmadd52huq Y3curr, M1, ACC3b
-
- # C0 = A0*B0 - A1*B1
- # C1 = A0*B1 + A1*B0
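- # .LpolyX (a shifted multiple of p) is added first so that every digit of
- # C0 = A0*B0 - A1*B1 stays non-negative after the subtraction.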
- vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
- vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b
-
- vpaddq ACC3a, ACC2a, ACC2a
- vpaddq ACC3b, ACC2b, ACC2b
-
- vpsubq ACC1a, ACC0a, ACC0a
- vpsubq ACC1b, ACC0b, ACC0b
- # Now normalize the accumulators to 52-bit words
- vpsrlq $52, ACC0a, A0a
- vpsrlq $52, ACC0b, A0b
-
- vpsrlq $52, ACC2a, A1a
- vpsrlq $52, ACC2b, A1b
-
- vpandq AND_MASK, ACC0a, ACC0a
- vpandq AND_MASK, ACC0b, ACC0b
- vpandq AND_MASK, ACC2a, ACC2a
- vpandq AND_MASK, ACC2b, ACC2b
-
- valignq $7, A0a, A0b, A0b
- valignq $7, ZERO, A0a, A0a
- valignq $7, A1a, A1b, A1b
- valignq $7, ZERO, A1a, A1a
-
- vpaddq A0a, ACC0a, ACC0a
- vpaddq A0b, ACC0b, ACC0b
- vpaddq A1a, ACC2a, ACC2a
- vpaddq A1b, ACC2b, ACC2b
-
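- # Resolve carries between the 52-bit digits. %k3/%k4 flag digits equal to
- # 2^52-1 (which propagate an incoming carry); together with %k1/%k2 the
- # byte-wise add/adc/xor below computes, generate/propagate style, the set
- # of digits that must be incremented (the adc carries the bit from the low
- # zmm half into the high half). Subtracting 2^52-1 under that mask and
- # re-masking to 52 bits adds 1 to each flagged digit.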
- vpcmpuq $1, A0a, ACC0a, %k1
- vpcmpuq $1, A0b, ACC0b, %k2
- vpcmpuq $0, AND_MASK, ACC0a, %k3
- vpcmpuq $0, AND_MASK, ACC0b, %k4
-
- kmovb %k1, %eax
- kmovb %k2, %ecx
- kmovb %k3, %r8d
- kmovb %k4, %r9d
-
- add %al, %al
- adc %cl, %cl
-
- add %r8b, %al
- adc %r9b, %cl
-
- xor %r8b, %al
- xor %r9b, %cl
-
- kmovb %eax, %k1
- kmovb %ecx, %k2
-
- vpsubq AND_MASK, ACC0a, ACC0a{%k1}
- vpsubq AND_MASK, ACC0b, ACC0b{%k2}
- vpandq AND_MASK, ACC0a, ACC0a
- vpandq AND_MASK, ACC0b, ACC0b
-
- vpcmpuq $1, A1a, ACC2a, %k1
- vpcmpuq $1, A1b, ACC2b, %k2
- vpcmpuq $0, AND_MASK, ACC2a, %k3
- vpcmpuq $0, AND_MASK, ACC2b, %k4
-
- kmovb %k1, %eax
- kmovb %k2, %ecx
- kmovb %k3, %r8d
- kmovb %k4, %r9d
-
- add %al, %al
- adc %cl, %cl
- add %r8b, %al
- adc %r9b, %cl
- xor %r8b, %al
- xor %r9b, %cl
- kmovb %eax, %k1
- kmovb %ecx, %k2
-
- vpsubq AND_MASK, ACC2a, ACC2a{%k1}
- vpsubq AND_MASK, ACC2b, ACC2b{%k2}
- vpandq AND_MASK, ACC2a, ACC2a
- vpandq AND_MASK, ACC2b, ACC2b
-
- mov $0x7f, %eax
- kmovw %eax, %k1
-
- vmovdqu64 ACC0a, 64*0(felemR)
- vmovdqu64 ACC0b, 64*1(felemR){%k5}
- vmovdqu64 ACC2a, 15*8 + 64*0(felemR)
- vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k5}
- ret
-
- ###############################################################################
-
- #define ST0 ACC3a
- #define ST1 ACC3b
- #define ST2 Y3curr
-
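- # fp2_sqr_ifma(felemR, felemA):
- #   R = A^2 in Fp2. Same digit-by-digit flow as fp2_mul_ifma with B = A,
- #   but only three accumulator pairs are needed (A0*A0, A1*A1, A0*A1; the
- #   cross term is doubled at the end), giving
- #   C0 = A0^2 - A1^2 and C1 = 2*A0*A1.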
- .globl C_ABI(fp2_sqr_ifma)
- .p2align 6
- C_ABI(fp2_sqr_ifma):
-
- mov $1, %eax
- kmovw %eax, %k1
- mov $0x7f, %eax
- kmovw %eax, %k2
-
- vpbroadcastq .LandMask(%rip), AND_MASK
- vpxorq ZERO, ZERO, ZERO
-
- vmovdqu64 64*0(felemA), A0a
- vmovdqu64 64*1(felemA), A0b{%k2}{z}
- vmovdqu64 15*8 + 64*0(felemA), A1a
- vmovdqu64 15*8 + 64*1(felemA), A1b{%k2}{z}
-
- # Load the modulus
- vmovdqa64 64*0 + .Lpoly(%rip), M0
- vmovdqa64 64*1 + .Lpoly(%rip), M1
-
- # Prepare the accumulators
- vpxorq ACC0a, ACC0a, ACC0a
- vpxorq ACC0b, ACC0b, ACC0b
- vpxorq ACC1a, ACC1a, ACC1a
- vpxorq ACC1b, ACC1b, ACC1b
- vpxorq ACC2a, ACC2a, ACC2a
- vpxorq ACC2b, ACC2b, ACC2b
- vpxorq T0, T0, T0
- vpxorq T1, T1, T1
- vpxorq T2, T2, T2
-
- # First iteration
- vpbroadcastq (felemA), B0curr
- vpbroadcastq 15*8(felemA), B1curr
- lea 8(felemA), felemA
-
- vpmadd52luq B0curr, A0a, ACC0a
- vpmadd52luq B0curr, A0b, ACC0b
- vpmadd52luq B1curr, A1a, ACC1a
- vpmadd52luq B1curr, A1b, ACC1b
- vpmadd52luq B0curr, A1a, ACC2a
- vpmadd52luq B0curr, A1b, ACC2b
-
- vpermq ACC0a, ZERO, Y0curr
- vpermq ACC1a, ZERO, Y1curr
- vpermq ACC2a, ZERO, Y2curr
-
- vpmadd52luq Y0curr, M0, ACC0a
- vpmadd52luq Y0curr, M1, ACC0b
- vpmadd52luq Y1curr, M0, ACC1a
- vpmadd52luq Y1curr, M1, ACC1b
- vpmadd52luq Y2curr, M0, ACC2a
- vpmadd52luq Y2curr, M1, ACC2b
-
- vpsrlq $52, ACC0a, T0{%k1}{z}
- vpsrlq $52, ACC1a, T1{%k1}{z}
- vpsrlq $52, ACC2a, T2{%k1}{z}
-
- mov $14, itr
-
- 1:
- # Shift the ACC in zmms right by a word
- valignq $1, ACC0a, ACC0b, ACC0a
- valignq $1, ACC0b, ZERO, ACC0b
- valignq $1, ACC1a, ACC1b, ACC1a
- valignq $1, ACC1b, ZERO, ACC1b
- valignq $1, ACC2a, ACC2b, ACC2a
- valignq $1, ACC2b, ZERO, ACC2b
-
- vpxorq ST0, ST0, ST0
- vpxorq ST1, ST1, ST1
- vpxorq ST2, ST2, ST2
-
- vmovdqa64 B0curr, B0prev
- vmovdqa64 B1curr, B1prev
- vmovdqa64 Y0curr, Y0prev
- vmovdqa64 Y1curr, Y1prev
- vmovdqa64 Y2curr, Y2prev
-
- vpbroadcastq (felemA), B0curr
- vpbroadcastq 15*8(felemA), B1curr
- lea 8(felemA), felemA
-
- # High multiplications
- vpmadd52huq B0prev, A0a, ACC0a # ACC0 = A0 * B0
- vpmadd52huq B1prev, A1a, ACC1a # ACC1 = A1 * B1
- vpmadd52huq B0prev, A1a, ACC2a # ACC2 = A1 * B0
- vpmadd52huq B0prev, A0b, ACC0b
- vpmadd52huq B1prev, A1b, ACC1b
- vpmadd52huq B0prev, A1b, ACC2b
- # We really want to have 8 independent vpmadd instructions in the pipe
- vpmadd52huq Y0prev, M0, T0
- vpmadd52huq Y1prev, M0, T1
- vpmadd52huq Y2prev, M0, T2
-
- vpmadd52huq Y0prev, M1, ACC0b
- vpmadd52huq Y1prev, M1, ACC1b
- vpmadd52huq Y2prev, M1, ACC2b
- # Low multiplications
- vpmadd52luq B0curr, A0a, ACC0a
- vpmadd52luq B1curr, A1a, ACC1a
- vpmadd52luq B0curr, A1a, ACC2a
-
- vpmadd52luq B0curr, A0b, ST0
- vpmadd52luq B1curr, A1b, ST1
- vpmadd52luq B0curr, A1b, ST2
-
- vpaddq T0, ACC0a, ACC0a
- vpaddq T1, ACC1a, ACC1a
- vpaddq T2, ACC2a, ACC2a
- vpermq ACC0a, ZERO, Y0curr
- vpermq ACC1a, ZERO, Y1curr
- vpermq ACC2a, ZERO, Y2curr
- vpaddq ST0, ACC0b, ACC0b
- vpaddq ST1, ACC1b, ACC1b
- vpaddq ST2, ACC2b, ACC2b
-
- vpmadd52luq Y0curr, M0, ACC0a
- vpmadd52luq Y0curr, M1, ACC0b
- vpmadd52luq Y1curr, M0, ACC1a
- vpmadd52luq Y1curr, M1, ACC1b
- vpmadd52luq Y2curr, M0, ACC2a
- vpmadd52luq Y2curr, M1, ACC2b
-
- vpsrlq $52, ACC0a, T0{%k1}{z}
- vpsrlq $52, ACC1a, T1{%k1}{z}
- vpsrlq $52, ACC2a, T2{%k1}{z}
-
- dec itr
- jne 1b
- valignq $1, ACC0a, ACC0b, ACC0a
- valignq $1, ACC0b, ZERO, ACC0b
- valignq $1, ACC1a, ACC1b, ACC1a
- valignq $1, ACC1b, ZERO, ACC1b
- valignq $1, ACC2a, ACC2b, ACC2a
- valignq $1, ACC2b, ZERO, ACC2b
- vpaddq T0, ACC0a, ACC0a
- vpaddq T1, ACC1a, ACC1a
- vpaddq T2, ACC2a, ACC2a
-
- # The last high multiplications
- vpmadd52huq B0curr, A0a, ACC0a
- vpmadd52huq B0curr, A0b, ACC0b
- vpmadd52huq B1curr, A1a, ACC1a
- vpmadd52huq B1curr, A1b, ACC1b
- vpmadd52huq B0curr, A1a, ACC2a
- vpmadd52huq B0curr, A1b, ACC2b
-
- vpmadd52huq Y0curr, M0, ACC0a
- vpmadd52huq Y0curr, M1, ACC0b
- vpmadd52huq Y1curr, M0, ACC1a
- vpmadd52huq Y1curr, M1, ACC1b
- vpmadd52huq Y2curr, M0, ACC2a
- vpmadd52huq Y2curr, M1, ACC2b
-
- # For squaring: C0 = A0^2 - A1^2
- #               C1 = 2*A0*A1
- vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
- vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b
-
- vpaddq ACC2a, ACC2a, ACC2a
- vpaddq ACC2b, ACC2b, ACC2b
-
- vpsubq ACC1a, ACC0a, ACC0a
- vpsubq ACC1b, ACC0b, ACC0b
-
- # Now normalize the accumulators to 52-bit words
- vpsrlq $52, ACC0a, A0a
- vpsrlq $52, ACC0b, A0b
- vpsrlq $52, ACC2a, A1a
- vpsrlq $52, ACC2b, A1b
-
- vpandq AND_MASK, ACC0a, ACC0a
- vpandq AND_MASK, ACC0b, ACC0b
- vpandq AND_MASK, ACC2a, ACC2a
- vpandq AND_MASK, ACC2b, ACC2b
-
- valignq $7, A0a, A0b, A0b
- valignq $7, ZERO, A0a, A0a
- valignq $7, A1a, A1b, A1b
- valignq $7, ZERO, A1a, A1a
-
- vpaddq A0a, ACC0a, ACC0a
- vpaddq A0b, ACC0b, ACC0b
- vpaddq A1a, ACC2a, ACC2a
- vpaddq A1b, ACC2b, ACC2b
-
- vpcmpuq $1, A0a, ACC0a, %k1
- vpcmpuq $1, A0b, ACC0b, %k2
- vpcmpuq $0, AND_MASK, ACC0a, %k3
- vpcmpuq $0, AND_MASK, ACC0b, %k4
-
- kmovb %k1, %eax
- kmovb %k2, %ecx
- kmovb %k3, %r8d
- kmovb %k4, %r9d
-
- add %al, %al
- adc %cl, %cl
- add %r8b, %al
- adc %r9b, %cl
- xor %r8b, %al
- xor %r9b, %cl
- kmovb %eax, %k1
- kmovb %ecx, %k2
-
- vpsubq AND_MASK, ACC0a, ACC0a{%k1}
- vpsubq AND_MASK, ACC0b, ACC0b{%k2}
- vpandq AND_MASK, ACC0a, ACC0a
- vpandq AND_MASK, ACC0b, ACC0b
-
- vpcmpuq $1, A1a, ACC2a, %k1
- vpcmpuq $1, A1b, ACC2b, %k2
- vpcmpuq $0, AND_MASK, ACC2a, %k3
- vpcmpuq $0, AND_MASK, ACC2b, %k4
-
- kmovb %k1, %eax
- kmovb %k2, %ecx
- kmovb %k3, %r8d
- kmovb %k4, %r9d
-
- add %al, %al
- adc %cl, %cl
- add %r8b, %al
- adc %r9b, %cl
- xor %r8b, %al
- xor %r9b, %cl
- kmovb %eax, %k1
- kmovb %ecx, %k2
-
- vpsubq AND_MASK, ACC2a, ACC2a{%k1}
- vpsubq AND_MASK, ACC2b, ACC2b{%k2}
- vpandq AND_MASK, ACC2a, ACC2a
- vpandq AND_MASK, ACC2b, ACC2b
-
- mov $0x7f, %eax
- kmovw %eax, %k1
-
- vmovdqu64 ACC0a, 64*0(felemR)
- vmovdqu64 ACC0b, 64*1(felemR){%k1}
- vmovdqu64 ACC2a, 15*8 + 64*0(felemR)
- vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k1}
- ret
-
- ###############################################################################
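- # fp2_sub(felemR, felemA, felemB): R = A - B in Fp2, digit-wise.
- # .LpolyX is added to A first so every digit stays non-negative, then the
- # shared normalization routine carries the result back into 52-bit digits.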
- .globl C_ABI(fp2_sub)
- .p2align 6
- C_ABI(fp2_sub):
-
- mov $1, %eax
- kmovw %eax, %k1
- mov $0x7f, %eax
- kmovw %eax, %k2
-
- vmovdqu64 64*0(felemA), ACC0a
- vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
- vmovdqu64 15*8 + 64*0(felemA), ACC1a
- vmovdqu64 15*8 + 64*1(felemA), ACC1b{%k2}{z}
-
- vmovdqu64 64*0(felemB), ACC2a
- vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
- vmovdqu64 15*8 + 64*0(felemB), ACC3a
- vmovdqu64 15*8 + 64*1(felemB), ACC3b{%k2}{z}
-
- vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
- vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b
- vpaddq 64*0 + .LpolyX(%rip), ACC1a, ACC1a
- vpaddq 64*1 + .LpolyX(%rip), ACC1b, ACC1b
-
- vpsubq ACC2a, ACC0a, ACC0a
- vpsubq ACC2b, ACC0b, ACC0b
- vpsubq ACC3a, ACC1a, ACC2a
- vpsubq ACC3b, ACC1b, ACC2b
-
- jmp C_ABI(fp2_normalize)
- ###############################################################################
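- # fp2_add(felemR, felemA, felemB): R = A + B in Fp2, digit-wise; falls
- # through into fp2_normalize.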
- .globl C_ABI(fp2_add)
- .p2align 6
- C_ABI(fp2_add):
-
- mov $1, %eax
- kmovw %eax, %k1
- mov $0x7f, %eax
- kmovw %eax, %k2
-
- vmovdqu64 64*0(felemA), ACC0a
- vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
- vmovdqu64 15*8 + 64*0(felemA), ACC1a
- vmovdqu64 15*8 + 64*1(felemA), ACC1b{%k2}{z}
-
- vmovdqu64 64*0(felemB), ACC2a
- vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
- vmovdqu64 15*8 + 64*0(felemB), ACC3a
- vmovdqu64 15*8 + 64*1(felemB), ACC3b{%k2}{z}
-
- vpaddq ACC2a, ACC0a, ACC0a
- vpaddq ACC2b, ACC0b, ACC0b
- vpaddq ACC3a, ACC1a, ACC2a
- vpaddq ACC3b, ACC1b, ACC2b
-
- // Fallthrough
- ###############################################################################
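- # fp2_normalize: carry every digit of the two accumulated Fp components
- # back into [0, 2^52) and store the result at felemR. Not exported.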
- .p2align 6
- C_ABI(fp2_normalize):
-
- vpbroadcastq .LandMask(%rip), AND_MASK
- vpxorq ZERO, ZERO, ZERO
-
- # Now normalize the accumulators to 52-bit words
- vpsrlq $52, ACC0a, A0a
- vpsrlq $52, ACC0b, A0b
- vpsrlq $52, ACC2a, A1a
- vpsrlq $52, ACC2b, A1b
-
- vpandq AND_MASK, ACC0a, ACC0a
- vpandq AND_MASK, ACC0b, ACC0b
- vpandq AND_MASK, ACC2a, ACC2a
- vpandq AND_MASK, ACC2b, ACC2b
-
- valignq $7, A0a, A0b, A0b
- valignq $7, ZERO, A0a, A0a
- valignq $7, A1a, A1b, A1b
- valignq $7, ZERO, A1a, A1a
-
- vpaddq A0a, ACC0a, ACC0a
- vpaddq A0b, ACC0b, ACC0b
- vpaddq A1a, ACC2a, ACC2a
- vpaddq A1b, ACC2b, ACC2b
-
- vpcmpuq $1, A0a, ACC0a, %k1
- vpcmpuq $1, A0b, ACC0b, %k2
- vpcmpuq $0, AND_MASK, ACC0a, %k3
- vpcmpuq $0, AND_MASK, ACC0b, %k4
-
- kmovb %k1, %eax
- kmovb %k2, %ecx
- kmovb %k3, %r8d
- kmovb %k4, %r9d
-
- add %al, %al
- adc %cl, %cl
- add %r8b, %al
- adc %r9b, %cl
- xor %r8b, %al
- xor %r9b, %cl
- kmovb %eax, %k1
- kmovb %ecx, %k2
-
- vpsubq AND_MASK, ACC0a, ACC0a{%k1}
- vpsubq AND_MASK, ACC0b, ACC0b{%k2}
- vpandq AND_MASK, ACC0a, ACC0a
- vpandq AND_MASK, ACC0b, ACC0b
-
- vpcmpuq $1, A1a, ACC2a, %k1
- vpcmpuq $1, A1b, ACC2b, %k2
- vpcmpuq $0, AND_MASK, ACC2a, %k3
- vpcmpuq $0, AND_MASK, ACC2b, %k4
-
- kmovb %k1, %eax
- kmovb %k2, %ecx
- kmovb %k3, %r8d
- kmovb %k4, %r9d
-
- add %al, %al
- adc %cl, %cl
- add %r8b, %al
- adc %r9b, %cl
- xor %r8b, %al
- xor %r9b, %cl
- kmovb %eax, %k1
- kmovb %ecx, %k2
-
- vpsubq AND_MASK, ACC2a, ACC2a{%k1}
- vpsubq AND_MASK, ACC2b, ACC2b{%k2}
- vpandq AND_MASK, ACC2a, ACC2a
- vpandq AND_MASK, ACC2b, ACC2b
-
- mov $0x7f, %eax
- kmovw %eax, %k1
-
- vmovdqu64 ACC0a, 64*0(felemR)
- vmovdqu64 ACC0b, 64*1(felemR){%k1}
- vmovdqu64 ACC2a, 15*8 + 64*0(felemR)
- vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k1}
-
- ret
-
-
- ###############################################################################
- #define p1ptr %rdi
- #define p2ptr %rsi
- #define swap %rdx
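- # fp2_swap(p1ptr, p2ptr, swap): constant-time conditional swap of the four
- # field elements at p1ptr with the four at p2ptr. 0 - swap produces an
- # all-zero or all-one mask, and vpternlogq acts as a bitwise select
- # between the two inputs.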
- .globl C_ABI(fp2_swap)
- .p2align 6
- C_ABI(fp2_swap):
-
- mov $0x7f, %eax
- kmovw %eax, %k2
- // TODO: get rid of the masks, not needed
- vmovdqu64 64*0(p1ptr), %zmm0
- vmovdqu64 64*1(p1ptr), %zmm1{%k2}{z}
- vmovdqu64 15*8 + 64*0(p1ptr), %zmm2
- vmovdqu64 15*8 + 64*1(p1ptr), %zmm3{%k2}{z}
- vmovdqu64 2*15*8 + 64*0(p1ptr), %zmm4
- vmovdqu64 2*15*8 + 64*1(p1ptr), %zmm5{%k2}{z}
- vmovdqu64 3*15*8 + 64*0(p1ptr), %zmm6
- vmovdqu64 3*15*8 + 64*1(p1ptr), %zmm7{%k2}{z}
-
- vmovdqu64 64*0(p2ptr), %zmm8
- vmovdqu64 64*1(p2ptr), %zmm9{%k2}{z}
- vmovdqu64 15*8 + 64*0(p2ptr), %zmm10
- vmovdqu64 15*8 + 64*1(p2ptr), %zmm11{%k2}{z}
- vmovdqu64 2*15*8 + 64*0(p2ptr), %zmm12
- vmovdqu64 2*15*8 + 64*1(p2ptr), %zmm13{%k2}{z}
- vmovdqu64 3*15*8 + 64*0(p2ptr), %zmm14
- vmovdqu64 3*15*8 + 64*1(p2ptr), %zmm15{%k2}{z}
-
- vpxorq %zmm16, %zmm16, %zmm16
- vpbroadcastq swap, %zmm17
- vpsubq %zmm17, %zmm16, %zmm16
-
- vmovdqa64 %zmm8, %zmm17
- vmovdqa64 %zmm9, %zmm18
- vmovdqa64 %zmm10, %zmm19
- vmovdqa64 %zmm11, %zmm20
- vmovdqa64 %zmm12, %zmm21
- vmovdqa64 %zmm13, %zmm22
- vmovdqa64 %zmm14, %zmm23
- vmovdqa64 %zmm15, %zmm24
-
- vpternlogq $0xd8, %zmm16, %zmm0, %zmm17
- vpternlogq $0xd8, %zmm16, %zmm1, %zmm18
- vpternlogq $0xd8, %zmm16, %zmm2, %zmm19
- vpternlogq $0xd8, %zmm16, %zmm3, %zmm20
- vpternlogq $0xd8, %zmm16, %zmm4, %zmm21
- vpternlogq $0xd8, %zmm16, %zmm5, %zmm22
- vpternlogq $0xd8, %zmm16, %zmm6, %zmm23
- vpternlogq $0xd8, %zmm16, %zmm7, %zmm24
-
- vpternlogq $0xe4, %zmm16, %zmm0, %zmm8
- vpternlogq $0xe4, %zmm16, %zmm1, %zmm9
- vpternlogq $0xe4, %zmm16, %zmm2, %zmm10
- vpternlogq $0xe4, %zmm16, %zmm3, %zmm11
- vpternlogq $0xe4, %zmm16, %zmm4, %zmm12
- vpternlogq $0xe4, %zmm16, %zmm5, %zmm13
- vpternlogq $0xe4, %zmm16, %zmm6, %zmm14
- vpternlogq $0xe4, %zmm16, %zmm7, %zmm15
-
-
- vmovdqu64 %zmm8, 64*0(p1ptr)
- vmovdqu64 %zmm9, 64*1(p1ptr){%k2}
- vmovdqu64 %zmm10, 15*8 + 64*0(p1ptr)
- vmovdqu64 %zmm11, 15*8 + 64*1(p1ptr){%k2}
- vmovdqu64 %zmm12, 2*15*8 + 64*0(p1ptr)
- vmovdqu64 %zmm13, 2*15*8 + 64*1(p1ptr){%k2}
- vmovdqu64 %zmm14, 3*15*8 + 64*0(p1ptr)
- vmovdqu64 %zmm15, 3*15*8 + 64*1(p1ptr){%k2}
-
- vmovdqu64 %zmm17, 64*0(p2ptr)
- vmovdqu64 %zmm18, 64*1(p2ptr){%k2}
- vmovdqu64 %zmm19, 15*8 + 64*0(p2ptr)
- vmovdqu64 %zmm20, 15*8 + 64*1(p2ptr){%k2}
- vmovdqu64 %zmm21, 2*15*8 + 64*0(p2ptr)
- vmovdqu64 %zmm22, 2*15*8 + 64*1(p2ptr){%k2}
- vmovdqu64 %zmm23, 3*15*8 + 64*0(p2ptr)
- vmovdqu64 %zmm24, 3*15*8 + 64*1(p2ptr){%k2}
-
- ret
- ###############################################################################
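- # fp_add(felemR, felemA, felemB): single Fp addition, digit-wise; falls
- # through into fp_normalize.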
- .globl C_ABI(fp_add)
- .p2align 6
- C_ABI(fp_add):
-
- mov $0x7f, %eax
- kmovw %eax, %k2
-
- vmovdqu64 64*0(felemA), ACC0a
- vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
-
- vmovdqu64 64*0(felemB), ACC2a
- vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
-
- vpaddq ACC2a, ACC0a, ACC0a
- vpaddq ACC2b, ACC0b, ACC0b
-
- // Fallthrough
- ###############################################################################
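- # fp_normalize: carry one Fp element's digits back into [0, 2^52) and
- # store it at %rdi. Not exported.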
- .p2align 6
- C_ABI(fp_normalize):
-
- vpbroadcastq .LandMask(%rip), AND_MASK
- vpxorq ZERO, ZERO, ZERO
-
- # Now normalize the accumulator to 52-bit words
- vpsrlq $52, ACC0a, A0a
- vpsrlq $52, ACC0b, A0b
-
- vpandq AND_MASK, ACC0a, ACC0a
- vpandq AND_MASK, ACC0b, ACC0b
-
- valignq $7, A0a, A0b, A0b
- valignq $7, ZERO, A0a, A0a
-
- vpaddq A0a, ACC0a, ACC0a
- vpaddq A0b, ACC0b, ACC0b
-
- vpcmpuq $1, A0a, ACC0a, %k1
- vpcmpuq $1, A0b, ACC0b, %k2
- vpcmpuq $0, AND_MASK, ACC0a, %k3
- vpcmpuq $0, AND_MASK, ACC0b, %k4
-
- kmovb %k1, %eax
- kmovb %k2, %ecx
- kmovb %k3, %r8d
- kmovb %k4, %r9d
-
- add %al, %al
- adc %cl, %cl
-
- add %r8b, %al
- adc %r9b, %cl
-
- xor %r8b, %al
- xor %r9b, %cl
-
- kmovb %eax, %k1
- kmovb %ecx, %k2
-
- vpsubq AND_MASK, ACC0a, ACC0a{%k1}
- vpsubq AND_MASK, ACC0b, ACC0b{%k2}
- vpandq AND_MASK, ACC0a, ACC0a
- vpandq AND_MASK, ACC0b, ACC0b
-
- mov $0x7f, %eax
- kmovw %eax, %k1
-
- vmovdqu64 ACC0a, 64*0(%rdi)
- vmovdqu64 ACC0b, 64*1(%rdi){%k1}
-
- ret
-
- ###############################################################################
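- # fp_sub(felemR, felemA, felemB): single Fp subtraction; .LpolyX is added
- # first to keep every digit non-negative, then control jumps to
- # fp_normalize.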
- .globl C_ABI(fp_sub)
- .p2align 6
- C_ABI(fp_sub):
-
- mov $0x7f, %eax
- kmovw %eax, %k2
-
- vmovdqu64 64*0(felemA), ACC0a
- vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
-
- vmovdqu64 64*0(felemB), ACC2a
- vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
-
- vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
- vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b
-
- vpsubq ACC2a, ACC0a, ACC0a
- vpsubq ACC2b, ACC0b, ACC0b
-
- jmp C_ABI(fp_normalize)
-