@@ -1,4 +1,4 @@
name: ntruhps2048509
type: kem | |||
claimed-nist-level: 1 | |||
claimed-security: IND-CCA2 | |||
@@ -15,9 +15,22 @@ auxiliary-submitters: | |||
- Jeffrey Hoffstein
- Andreas Hülsing
- Joost Rijneveld
- Tsunekazu Saito
- Peter Schwabe
- William Whyte
- Keita Xagawa
- Takashi Yamakawa
- Zhenfei Zhang
implementations:
- name: clean
  version: https://github.com/jschanck/ntru/tree/4699d70a reference implementation
- name: avx2
  version: https://github.com/jschanck/ntru/tree/4699d70a avx2 implementation
  supported_platforms:
  - architecture: x86_64
    operating_systems:
    - Linux
    - Darwin
    required_flags:
    - avx2
    - bmi2
@@ -0,0 +1 @@ | |||
Public Domain |
@@ -0,0 +1,24 @@ | |||
# This Makefile can be used with GNU Make or BSD Make
LIB=libntruhps2048509_avx2.a
HEADERS=api.h cmov.h crypto_sort_int32.h owcpa.h params.h poly.h poly_r2_inv.h sample.h
OBJECTS=cmov.o crypto_sort_int32.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_r2_inv.o poly_s3_inv.o sample.o sample_iid.o \
	square_1_509_patience.o square_3_509_patience.o square_6_509_patience.o square_15_509_shufbytes.o square_30_509_shufbytes.o square_63_509_shufbytes.o square_126_509_shufbytes.o square_252_509_shufbytes.o \
	poly_mod_3_Phi_n.o poly_mod_q_Phi_n.o poly_r2_mul.o poly_rq_mul.o poly_rq_to_s3.o vec32_sample_iid.o

CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)

all: $(LIB)

%.o: %.s $(HEADERS)
	$(AS) -o $@ $<

%.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS)
	$(AR) -r $@ $(OBJECTS)

clean:
	$(RM) $(OBJECTS)
	$(RM) $(LIB)
@@ -0,0 +1,19 @@ | |||
#ifndef PQCLEAN_NTRUHPS2048509_AVX2_API_H | |||
#define PQCLEAN_NTRUHPS2048509_AVX2_API_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_SECRETKEYBYTES 935 | |||
#define PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_PUBLICKEYBYTES 699 | |||
#define PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_CIPHERTEXTBYTES 699 | |||
#define PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_BYTES 32 | |||
#define PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_ALGNAME "ntruhps2048509" | |||
int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); | |||
int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk); | |||
int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk); | |||
#endif |
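/*
 * Example (not part of the upstream sources): a minimal KEM round trip
 * against the API above. It assumes randombytes() from PQClean's common
 * randombytes.h is available and seeded; all sizes come from api.h.
 */
#include <assert.h>
#include <string.h>
#include "api.h"

static void kem_roundtrip_demo(void) {
    uint8_t pk[PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_PUBLICKEYBYTES];
    uint8_t sk[PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_SECRETKEYBYTES];
    uint8_t ct[PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_CIPHERTEXTBYTES];
    uint8_t k1[PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_BYTES];
    uint8_t k2[PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_BYTES];

    PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_keypair(pk, sk);
    PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_enc(ct, k1, pk); /* sender */
    PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_dec(k2, ct, sk); /* receiver */
    assert(memcmp(k1, k2, sizeof k1) == 0); /* both sides hold the same 32-byte key */
}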
@@ -0,0 +1,11 @@ | |||
#include "cmov.h" | |||
/* b = 1 means mov, b = 0 means don't mov*/ | |||
void PQCLEAN_NTRUHPS2048509_AVX2_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) { | |||
size_t i; | |||
b = (~b + 1); | |||
for (i = 0; i < len; i++) { | |||
r[i] ^= b & (x[i] ^ r[i]); | |||
} | |||
} |
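/*
 * Illustration (not in the sources): why b = (~b + 1) above yields a
 * constant-time select. Negation maps 1 -> 0xff and 0 -> 0x00, so
 * r ^= b & (x ^ r) either copies x (mask 0xff) or leaves r unchanged
 * (mask 0x00) without any secret-dependent branch.
 */
#include <assert.h>

static void cmov_mask_demo(void) {
    unsigned char r = 0x5a, x = 0xc3, b = 1;
    b = (unsigned char) (~b + 1);       /* b == 0xff */
    r ^= (unsigned char) (b & (x ^ r)); /* r == x now */
    assert(r == 0xc3);
}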
@@ -0,0 +1,10 @@ | |||
#ifndef VERIFY_H | |||
#define VERIFY_H | |||
#include "params.h" | |||
#include <stddef.h> | |||
void PQCLEAN_NTRUHPS2048509_AVX2_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b); | |||
#endif |
@@ -0,0 +1,11 @@ | |||
#ifndef CRYPTO_SORT | |||
#define CRYPTO_SORT | |||
#include "params.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
void PQCLEAN_NTRUHPS2048509_AVX2_crypto_sort_int32(int32_t *x, size_t n); | |||
#endif |
@@ -0,0 +1,68 @@ | |||
#include "api.h" | |||
#include "cmov.h" | |||
#include "fips202.h" | |||
#include "owcpa.h" | |||
#include "params.h" | |||
#include "randombytes.h" | |||
#include "sample.h" | |||
// API FUNCTIONS | |||
int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
uint8_t seed[NTRU_SAMPLE_FG_BYTES]; | |||
randombytes(seed, NTRU_SAMPLE_FG_BYTES); | |||
PQCLEAN_NTRUHPS2048509_AVX2_owcpa_keypair(pk, sk, seed); | |||
randombytes(sk + NTRU_OWCPA_SECRETKEYBYTES, NTRU_PRFKEYBYTES); | |||
return 0; | |||
} | |||
int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) { | |||
poly r, m; | |||
uint8_t rm[NTRU_OWCPA_MSGBYTES]; | |||
uint8_t rm_seed[NTRU_SAMPLE_RM_BYTES]; | |||
randombytes(rm_seed, NTRU_SAMPLE_RM_BYTES); | |||
PQCLEAN_NTRUHPS2048509_AVX2_sample_rm(&r, &m, rm_seed); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(rm, &r); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(rm + NTRU_PACK_TRINARY_BYTES, &m); | |||
sha3_256(k, rm, NTRU_OWCPA_MSGBYTES); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(&r); | |||
PQCLEAN_NTRUHPS2048509_AVX2_owcpa_enc(c, &r, &m, pk); | |||
return 0; | |||
} | |||
int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { | |||
int i, fail; | |||
uint8_t rm[NTRU_OWCPA_MSGBYTES]; | |||
uint8_t buf[NTRU_PRFKEYBYTES + NTRU_CIPHERTEXTBYTES]; | |||
fail = 0; | |||
/* Check that unused bits of last byte of ciphertext are zero */ | |||
fail |= c[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG)))); | |||
fail |= PQCLEAN_NTRUHPS2048509_AVX2_owcpa_dec(rm, c, sk); | |||
/* If fail = 0 then c = Enc(h, rm). There is no need to re-encapsulate. */ | |||
/* See comment in PQCLEAN_NTRUHPS2048509_AVX2_owcpa_dec for details. */ | |||
sha3_256(k, rm, NTRU_OWCPA_MSGBYTES); | |||
/* sha3_256(secret PRF key || input ciphertext), for implicit rejection */
for (i = 0; i < NTRU_PRFKEYBYTES; i++) { | |||
buf[i] = sk[i + NTRU_OWCPA_SECRETKEYBYTES]; | |||
} | |||
for (i = 0; i < NTRU_CIPHERTEXTBYTES; i++) { | |||
buf[NTRU_PRFKEYBYTES + i] = c[i]; | |||
} | |||
sha3_256(rm, buf, NTRU_PRFKEYBYTES + NTRU_CIPHERTEXTBYTES); | |||
PQCLEAN_NTRUHPS2048509_AVX2_cmov(k, rm, NTRU_SHAREDKEYBYTES, (unsigned char) fail); | |||
return 0; | |||
} |
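/*
 * Sketch (not upstream code) of the implicit-rejection behaviour above:
 * crypto_kem_dec never reports failure. On an invalid ciphertext it
 * silently returns sha3_256(prfkey || c) instead, so (with overwhelming
 * probability) a tampered ciphertext yields a key unrelated to the
 * sender's rather than a decryption-failure oracle.
 */
#include <assert.h>
#include <string.h>
#include "api.h"

static void implicit_rejection_demo(const uint8_t *ct_in, const uint8_t *sk,
                                    const uint8_t *k_sender) {
    uint8_t ct[PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_CIPHERTEXTBYTES];
    uint8_t k[PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_BYTES];
    memcpy(ct, ct_in, sizeof ct);
    ct[0] ^= 1;                                             /* corrupt one bit */
    PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_dec(k, ct, sk);  /* still returns 0 */
    assert(memcmp(k, k_sender, sizeof k) != 0);             /* rejection key, not shared */
}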
@@ -0,0 +1,160 @@ | |||
#include "owcpa.h" | |||
#include "poly.h" | |||
#include "sample.h" | |||
static int owcpa_check_r(const poly *r) { | |||
/* Check that r is in message space. */ | |||
/* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */ | |||
int i; | |||
uint64_t t = 0; | |||
uint16_t c; | |||
for (i = 0; i < NTRU_N; i++) { | |||
c = MODQ(r->coeffs[i] + 1); | |||
t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */ | |||
t |= (c + 1) & 0x4; /* 0 if c is in {0,1,2} */ | |||
} | |||
t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */ | |||
    t = (~t + 1); /* two's complement: t = 0 stays 0, any nonzero t gets its top bit set */
    t >>= 63;
    return (int) t;
} | |||
static int owcpa_check_m(const poly *m) { | |||
/* Check that m is in message space. */ | |||
/* Note: Assumes that m has coefficients in {0,1,2}. */ | |||
int i; | |||
uint64_t t = 0; | |||
uint16_t p1 = 0; | |||
uint16_t m1 = 0; | |||
for (i = 0; i < NTRU_N; i++) { | |||
p1 += m->coeffs[i] & 0x01; | |||
m1 += (m->coeffs[i] & 0x02) >> 1; | |||
} | |||
/* Need p1 = m1 and p1 + m1 = NTRU_WEIGHT */ | |||
t |= p1 ^ m1; | |||
t |= (p1 + m1) ^ NTRU_WEIGHT; | |||
    t = (~t + 1); /* two's complement: t = 0 stays 0, any nonzero t gets its top bit set */
    t >>= 63;
    return (int) t;
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_owcpa_keypair(unsigned char *pk, | |||
unsigned char *sk, | |||
const unsigned char seed[NTRU_SAMPLE_FG_BYTES]) { | |||
int i; | |||
poly x1, x2, x3, x4, x5; | |||
poly *f = &x1, *g = &x2, *invf_mod3 = &x3; | |||
poly *gf = &x3, *invgf = &x4, *tmp = &x5; | |||
poly *invh = &x3, *h = &x3; | |||
PQCLEAN_NTRUHPS2048509_AVX2_sample_fg(f, g, seed); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_inv(invf_mod3, f); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(sk, f); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(sk + NTRU_PACK_TRINARY_BYTES, invf_mod3); | |||
/* Lift coeffs of f and g from Z_p to Z_q */ | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(f); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(g); | |||
/* g = 3*g */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
g->coeffs[i] = 3 * g->coeffs[i]; | |||
} | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(gf, g, f); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_inv(invgf, gf); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(tmp, invgf, f); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_mul(invh, tmp, f); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_tobytes(sk + 2 * NTRU_PACK_TRINARY_BYTES, invh); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(tmp, invgf, g); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(h, tmp, g); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_tobytes(pk, h); | |||
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_owcpa_enc(unsigned char *c, | |||
const poly *r, | |||
const poly *m, | |||
const unsigned char *pk) { | |||
int i; | |||
poly x1, x2; | |||
poly *h = &x1, *liftm = &x1; | |||
poly *ct = &x2; | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_frombytes(h, pk); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(ct, r, h); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_lift(liftm, m); | |||
for (i = 0; i < NTRU_N; i++) { | |||
ct->coeffs[i] = ct->coeffs[i] + liftm->coeffs[i]; | |||
} | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_tobytes(c, ct); | |||
} | |||
int PQCLEAN_NTRUHPS2048509_AVX2_owcpa_dec(unsigned char *rm, | |||
const unsigned char *ciphertext, | |||
const unsigned char *secretkey) { | |||
int i; | |||
int fail; | |||
poly x1, x2, x3, x4; | |||
poly *c = &x1, *f = &x2, *cf = &x3; | |||
poly *mf = &x2, *finv3 = &x3, *m = &x4; | |||
poly *liftm = &x2, *invh = &x3, *r = &x4; | |||
poly *b = &x1; | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_frombytes(c, ciphertext); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_frombytes(f, secretkey); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(f); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(cf, c, f); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3(mf, cf); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_frombytes(finv3, secretkey + NTRU_PACK_TRINARY_BYTES); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_mul(m, mf, finv3); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(rm + NTRU_PACK_TRINARY_BYTES, m); | |||
/* NOTE: For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)). */ | |||
/* We can avoid re-computing r*h + Lift(m) as long as we check that */ | |||
/* r (defined as b/h mod (q, Phi_n)) and m are in the message space. */ | |||
/* (m can take any value in S3 in NTRU_HRSS) */ | |||
fail = 0; | |||
fail |= owcpa_check_m(m); | |||
/* b = c - Lift(m) mod (q, x^n - 1) */ | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_lift(liftm, m); | |||
for (i = 0; i < NTRU_N; i++) { | |||
b->coeffs[i] = c->coeffs[i] - liftm->coeffs[i]; | |||
} | |||
/* r = b / h mod (q, Phi_n) */ | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_frombytes(invh, secretkey + 2 * NTRU_PACK_TRINARY_BYTES); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_mul(r, b, invh); | |||
/* NOTE: Our definition of r as b/h mod (q, Phi_n) follows Figure 4 of */ | |||
/* [Sch18] https://eprint.iacr.org/2018/1174/20181203:032458. */ | |||
/* This differs from Figure 10 of Saito--Xagawa--Yamakawa */ | |||
/* [SXY17] https://eprint.iacr.org/2017/1005/20180516:055500 */ | |||
/* where r gets a final reduction modulo p. */ | |||
/* We need this change to use Proposition 1 of [Sch18]. */ | |||
    /* Proposition 1 of [Sch18] shows that re-encryption with (r,m) yields c */
    /* if and only if fail == 0 after the following call to owcpa_check_r.   */
/* The procedure given in Fig. 8 of [Sch18] can be skipped because we have */ | |||
/* c(1) = 0 due to the use of poly_Rq_sum_zero_{to,from}bytes. */ | |||
fail |= owcpa_check_r(r); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_trinary_Zq_to_Z3(r); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(rm, r); | |||
return fail; | |||
} |
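/*
 * Worked illustration (not upstream code) of the owcpa_check_m invariant:
 * a valid HPS message has exactly NTRU_WEIGHT/2 = 127 coefficients equal
 * to 1 and 127 equal to 2 (the rest 0), so p1 == m1 and p1 + m1 == 254.
 */
#include <assert.h>
#include "params.h"
#include "poly.h"

static void check_m_invariant_demo(void) {
    int i;
    uint16_t p1 = 0, m1 = 0;
    poly m = {0};
    for (i = 0; i < NTRU_WEIGHT / 2; i++) {
        m.coeffs[i] = 1;                   /* 127 ones ...     */
        m.coeffs[NTRU_WEIGHT / 2 + i] = 2; /* ... and 127 twos */
    }
    /* mirrors the counting loop in owcpa_check_m */
    for (i = 0; i < NTRU_N; i++) {
        p1 += m.coeffs[i] & 0x01;
        m1 += (m.coeffs[i] & 0x02) >> 1;
    }
    assert(p1 == m1 && p1 + m1 == NTRU_WEIGHT);
}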
@@ -0,0 +1,22 @@ | |||
#ifndef OWCPA_H | |||
#define OWCPA_H | |||
#include "params.h" | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048509_AVX2_owcpa_samplemsg(unsigned char msg[NTRU_OWCPA_MSGBYTES], | |||
const unsigned char seed[NTRU_SEEDBYTES]); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_owcpa_keypair(unsigned char *pk,
        unsigned char *sk,
        const unsigned char seed[NTRU_SAMPLE_FG_BYTES]);
void PQCLEAN_NTRUHPS2048509_AVX2_owcpa_enc(unsigned char *c, | |||
const poly *r, | |||
const poly *m, | |||
const unsigned char *pk); | |||
int PQCLEAN_NTRUHPS2048509_AVX2_owcpa_dec(unsigned char *rm, | |||
const unsigned char *ciphertext, | |||
const unsigned char *secretkey); | |||
#endif |
@@ -0,0 +1,46 @@ | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(unsigned char msg[NTRU_OWCPA_MSGBYTES], const poly *a) { | |||
int i; | |||
unsigned char c; | |||
int j; | |||
for (i = 0; i < NTRU_PACK_DEG / 5; i++) { | |||
c = a->coeffs[5 * i + 4] & 255; | |||
c = (3 * c + a->coeffs[5 * i + 3]) & 255; | |||
c = (3 * c + a->coeffs[5 * i + 2]) & 255; | |||
c = (3 * c + a->coeffs[5 * i + 1]) & 255; | |||
c = (3 * c + a->coeffs[5 * i + 0]) & 255; | |||
msg[i] = c; | |||
} | |||
i = NTRU_PACK_DEG / 5; | |||
c = 0; | |||
for (j = NTRU_PACK_DEG - (5 * i) - 1; j >= 0; j--) { | |||
c = (3 * c + a->coeffs[5 * i + j]) & 255; | |||
} | |||
msg[i] = c; | |||
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_frombytes(poly *r, const unsigned char msg[NTRU_OWCPA_MSGBYTES]) { | |||
int i; | |||
unsigned char c; | |||
int j; | |||
for (i = 0; i < NTRU_PACK_DEG / 5; i++) { | |||
c = msg[i]; | |||
        r->coeffs[5 * i + 0] = c;
        r->coeffs[5 * i + 1] = c * 171 >> 9;  // multiply-shift trick for floor(c/3)
        r->coeffs[5 * i + 2] = c * 57 >> 9;   // floor(c/3^2)
        r->coeffs[5 * i + 3] = c * 19 >> 9;   // floor(c/3^3)
        r->coeffs[5 * i + 4] = c * 203 >> 14; // floor(c/3^4)
} | |||
i = NTRU_PACK_DEG / 5; | |||
c = msg[i]; | |||
for (j = 0; (5 * i + j) < NTRU_PACK_DEG; j++) { | |||
r->coeffs[5 * i + j] = c; | |||
c = c * 171 >> 9; | |||
} | |||
r->coeffs[NTRU_N - 1] = 0; | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n(r); | |||
} | |||
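/*
 * Worked example (not upstream code): five trits per byte. Encoding is
 * c = t0 + 3*t1 + 9*t2 + 27*t3 + 81*t4 (at most 242, so no wraparound);
 * decoding peels off one trit at a time using the same multiply-shift
 * division as above. The final mod-3 step that poly_S3_frombytes defers
 * to poly_mod_3_Phi_n is inlined here as c % 3.
 */
#include <assert.h>

static void pack3_demo(void) {
    unsigned char t[5] = {2, 0, 1, 2, 1}, c, j;
    c = (unsigned char) (t[0] + 3 * t[1] + 9 * t[2] + 27 * t[3] + 81 * t[4]);
    for (j = 0; j < 5; j++) {
        assert(c % 3 == t[j]);              /* lowest trit matches       */
        c = (unsigned char) (c * 171 >> 9); /* floor(c / 3) for c < 256  */
    }
}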
@@ -0,0 +1,93 @@ | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_tobytes(unsigned char *r, const poly *a) { | |||
int i, j; | |||
uint16_t t[8]; | |||
for (i = 0; i < NTRU_PACK_DEG / 8; i++) { | |||
for (j = 0; j < 8; j++) { | |||
t[j] = MODQ(a->coeffs[8 * i + j]); | |||
} | |||
r[11 * i + 0] = (unsigned char) ( t[0] & 0xff); | |||
r[11 * i + 1] = (unsigned char) ((t[0] >> 8) | ((t[1] & 0x1f) << 3)); | |||
r[11 * i + 2] = (unsigned char) ((t[1] >> 5) | ((t[2] & 0x03) << 6)); | |||
r[11 * i + 3] = (unsigned char) ((t[2] >> 2) & 0xff); | |||
r[11 * i + 4] = (unsigned char) ((t[2] >> 10) | ((t[3] & 0x7f) << 1)); | |||
r[11 * i + 5] = (unsigned char) ((t[3] >> 7) | ((t[4] & 0x0f) << 4)); | |||
r[11 * i + 6] = (unsigned char) ((t[4] >> 4) | ((t[5] & 0x01) << 7)); | |||
r[11 * i + 7] = (unsigned char) ((t[5] >> 1) & 0xff); | |||
r[11 * i + 8] = (unsigned char) ((t[5] >> 9) | ((t[6] & 0x3f) << 2)); | |||
r[11 * i + 9] = (unsigned char) ((t[6] >> 6) | ((t[7] & 0x07) << 5)); | |||
r[11 * i + 10] = (unsigned char) ((t[7] >> 3)); | |||
} | |||
for (j = 0; j < NTRU_PACK_DEG - 8 * i; j++) { | |||
t[j] = MODQ(a->coeffs[8 * i + j]); | |||
} | |||
for (; j < 8; j++) { | |||
t[j] = 0; | |||
} | |||
    switch (NTRU_PACK_DEG & 0x07) {
        // Cases 0 and 6 are impossible: NTRU_PACK_DEG = n-1 == 0 or 6 (mod 8)
        // would mean n == 1 or 7 (mod 8), so 2 would be a quadratic residue
        // mod n, contradicting the requirement that 2 generates (Z/n)*.
case 4: | |||
r[11 * i + 0] = (unsigned char) (t[0] & 0xff); | |||
r[11 * i + 1] = (unsigned char) (t[0] >> 8) | ((t[1] & 0x1f) << 3); | |||
r[11 * i + 2] = (unsigned char) (t[1] >> 5) | ((t[2] & 0x03) << 6); | |||
r[11 * i + 3] = (unsigned char) (t[2] >> 2) & 0xff; | |||
r[11 * i + 4] = (unsigned char) (t[2] >> 10) | ((t[3] & 0x7f) << 1); | |||
r[11 * i + 5] = (unsigned char) (t[3] >> 7) | ((t[4] & 0x0f) << 4); | |||
break; | |||
case 2: | |||
r[11 * i + 0] = (unsigned char) (t[0] & 0xff); | |||
r[11 * i + 1] = (unsigned char) (t[0] >> 8) | ((t[1] & 0x1f) << 3); | |||
r[11 * i + 2] = (unsigned char) (t[1] >> 5) | ((t[2] & 0x03) << 6); | |||
break; | |||
} | |||
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_frombytes(poly *r, const unsigned char *a) { | |||
int i; | |||
for (i = 0; i < NTRU_PACK_DEG / 8; i++) { | |||
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8); | |||
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5); | |||
r->coeffs[8 * i + 2] = (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10); | |||
r->coeffs[8 * i + 3] = (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7); | |||
r->coeffs[8 * i + 4] = (a[11 * i + 5] >> 4) | (((uint16_t)a[11 * i + 6] & 0x7f) << 4); | |||
r->coeffs[8 * i + 5] = (a[11 * i + 6] >> 7) | (((uint16_t)a[11 * i + 7] & 0xff) << 1) | (((uint16_t)a[11 * i + 8] & 0x03) << 9); | |||
r->coeffs[8 * i + 6] = (a[11 * i + 8] >> 2) | (((uint16_t)a[11 * i + 9] & 0x1f) << 6); | |||
r->coeffs[8 * i + 7] = (a[11 * i + 9] >> 5) | (((uint16_t)a[11 * i + 10] & 0xff) << 3); | |||
} | |||
    switch (NTRU_PACK_DEG & 0x07) {
        // Cases 0 and 6 are impossible: NTRU_PACK_DEG = n-1 == 0 or 6 (mod 8)
        // would mean n == 1 or 7 (mod 8), so 2 would be a quadratic residue
        // mod n, contradicting the requirement that 2 generates (Z/n)*.
case 4: | |||
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8); | |||
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5); | |||
r->coeffs[8 * i + 2] = (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10); | |||
r->coeffs[8 * i + 3] = (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7); | |||
break; | |||
case 2: | |||
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8); | |||
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5); | |||
break; | |||
} | |||
r->coeffs[NTRU_N - 1] = 0; | |||
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_tobytes(unsigned char *r, const poly *a) { | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_tobytes(r, a); | |||
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_frombytes(poly *r, const unsigned char *a) { | |||
int i; | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_frombytes(r, a); | |||
/* Set r[n-1] so that the sum of coefficients is zero mod q */ | |||
r->coeffs[NTRU_N - 1] = 0; | |||
for (i = 0; i < NTRU_PACK_DEG; i++) { | |||
r->coeffs[NTRU_N - 1] -= r->coeffs[i]; | |||
} | |||
} |
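/*
 * Sketch (not upstream code) of the layout used above: 8 coefficients of
 * NTRU_LOGQ = 11 bits each are packed into 11 bytes, least significant
 * bit first. A generic bit-cursor version shows the same arithmetic
 * without the unrolled shift/mask expressions.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void pack11_demo(void) {
    uint16_t in[8] = {0, 1, 2047, 1024, 33, 700, 1, 509}, out[8] = {0};
    unsigned char buf[11] = {0};
    int i, bit;
    for (i = 0; i < 8; i++) {       /* pack: write 11 bits per value */
        for (bit = 0; bit < 11; bit++) {
            buf[(11 * i + bit) / 8] |=
                (unsigned char) (((in[i] >> bit) & 1) << ((11 * i + bit) % 8));
        }
    }
    for (i = 0; i < 8; i++) {       /* unpack: read the bits back */
        for (bit = 0; bit < 11; bit++) {
            out[i] |= (uint16_t) (((buf[(11 * i + bit) / 8] >> ((11 * i + bit) % 8)) & 1) << bit);
        }
    }
    assert(memcmp(in, out, sizeof in) == 0);
}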
@@ -0,0 +1,37 @@ | |||
#ifndef PARAMS_H | |||
#define PARAMS_H | |||
#define NTRU_HPS | |||
#define NTRU_N 509 | |||
#define NTRU_LOGQ 11 | |||
/* Do not modify below this line */ | |||
#define PAD32(X) ((((X) + 31)/32)*32) | |||
#define NTRU_Q (1 << NTRU_LOGQ) | |||
#define NTRU_WEIGHT (NTRU_Q/8 - 2) | |||
#define NTRU_SEEDBYTES 32 | |||
#define NTRU_PRFKEYBYTES 32 | |||
#define NTRU_SHAREDKEYBYTES 32 | |||
#define NTRU_SAMPLE_IID_BYTES (NTRU_N-1) | |||
#define NTRU_SAMPLE_FT_BYTES ((30*(NTRU_N-1)+7)/8) | |||
#define NTRU_SAMPLE_FG_BYTES (NTRU_SAMPLE_IID_BYTES+NTRU_SAMPLE_FT_BYTES) | |||
#define NTRU_SAMPLE_RM_BYTES (NTRU_SAMPLE_IID_BYTES+NTRU_SAMPLE_FT_BYTES) | |||
#define NTRU_PACK_DEG (NTRU_N-1) | |||
#define NTRU_PACK_TRINARY_BYTES ((NTRU_PACK_DEG+4)/5) | |||
#define NTRU_OWCPA_MSGBYTES (2*NTRU_PACK_TRINARY_BYTES) | |||
#define NTRU_OWCPA_PUBLICKEYBYTES ((NTRU_LOGQ*NTRU_PACK_DEG+7)/8) | |||
#define NTRU_OWCPA_SECRETKEYBYTES (2*NTRU_PACK_TRINARY_BYTES + NTRU_OWCPA_PUBLICKEYBYTES) | |||
#define NTRU_OWCPA_BYTES ((NTRU_LOGQ*NTRU_PACK_DEG+7)/8) | |||
#define NTRU_PUBLICKEYBYTES (NTRU_OWCPA_PUBLICKEYBYTES) | |||
#define NTRU_SECRETKEYBYTES (NTRU_OWCPA_SECRETKEYBYTES + NTRU_PRFKEYBYTES) | |||
#define NTRU_CIPHERTEXTBYTES (NTRU_OWCPA_BYTES) | |||
#endif |
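/*
 * Sanity check (not upstream code), assuming a C11 compiler for
 * _Static_assert: the constants derived above must agree with the byte
 * counts hard-coded in api.h.
 */
#include "params.h"

_Static_assert(NTRU_PACK_TRINARY_BYTES == 102, "ceil(508/5) bytes per trinary half");
_Static_assert(NTRU_PUBLICKEYBYTES == 699, "matches CRYPTO_PUBLICKEYBYTES");
_Static_assert(NTRU_CIPHERTEXTBYTES == 699, "matches CRYPTO_CIPHERTEXTBYTES");
_Static_assert(NTRU_SECRETKEYBYTES == 935, "2*102 + 699 + 32 = CRYPTO_SECRETKEYBYTES");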
@@ -0,0 +1,67 @@ | |||
#include "poly.h" | |||
/* Map {0, 1, 2} -> {0,1,q-1} in place */ | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(poly *r) { | |||
int i; | |||
for (i = 0; i < NTRU_N; i++) { | |||
r->coeffs[i] = r->coeffs[i] | ((-(r->coeffs[i] >> 1)) & (NTRU_Q - 1)); | |||
} | |||
} | |||
/* Map {0, 1, q-1} -> {0,1,2} in place */ | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_trinary_Zq_to_Z3(poly *r) { | |||
int i; | |||
for (i = 0; i < NTRU_N; i++) { | |||
r->coeffs[i] = MODQ(r->coeffs[i]); | |||
r->coeffs[i] = 3 & (r->coeffs[i] ^ (r->coeffs[i] >> (NTRU_LOGQ - 1))); | |||
} | |||
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_mul(poly *r, const poly *a, const poly *b) { | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(r, a, b); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n(r); | |||
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_mul(poly *r, const poly *a, const poly *b) { | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(r, a, b); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n(r); | |||
} | |||
static void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_inv_to_Rq_inv(poly *r, const poly *ai, const poly *a) { | |||
int i; | |||
poly b, c; | |||
poly s; | |||
    // Four Newton iterations of ai <- ai * (2 - a*ai) mod q: each iteration
    // doubles the precision of the inverse, lifting it from mod 2 through
    // mod 2^2, 2^4, 2^8 to mod 2^16 >= q.
for (i = 0; i < NTRU_N; i++) { | |||
b.coeffs[i] = -(a->coeffs[i]); | |||
} | |||
for (i = 0; i < NTRU_N; i++) { | |||
r->coeffs[i] = ai->coeffs[i]; | |||
} | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&c, r, &b); | |||
c.coeffs[0] += 2; // c = 2 - a*ai | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&s, &c, r); // s = ai*c | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&c, &s, &b); | |||
c.coeffs[0] += 2; // c = 2 - a*s | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(r, &c, &s); // r = s*c | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&c, r, &b); | |||
c.coeffs[0] += 2; // c = 2 - a*r | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&s, &c, r); // s = r*c | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&c, &s, &b); | |||
c.coeffs[0] += 2; // c = 2 - a*s | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(r, &c, &s); // r = s*c | |||
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_inv(poly *r, const poly *a) { | |||
poly ai2; | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_inv(&ai2, a); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_inv_to_Rq_inv(r, &ai2, a); | |||
} |
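/*
 * Scalar illustration (not upstream code) of the Newton/Hensel lifting in
 * poly_R2_inv_to_Rq_inv above: starting from an inverse mod 2, each step
 * x <- x * (2 - a*x) doubles the number of correct low bits, so four steps
 * reach mod 2^16 >= q = 2^11. The code above applies the same recurrence
 * to polynomials, with poly_Rq_mul in place of integer multiplication.
 */
#include <assert.h>
#include <stdint.h>

static void newton_inverse_demo(void) {
    uint32_t a = 1459; /* any odd value is a unit mod a power of 2 */
    uint32_t x = 1;    /* the inverse of any odd a mod 2 is 1      */
    int i;
    for (i = 0; i < 4; i++) {
        x = (x * (2 - a * x)) & 0xffff; /* lift: mod 2 -> 4 -> 16 -> 256 -> 65536 */
    }
    assert(((a * x) & (2048 - 1)) == 1); /* a*x == 1 mod q = 2^11 */
}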
@@ -0,0 +1,41 @@ | |||
#ifndef POLY_H | |||
#define POLY_H | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include "params.h" | |||
#define MODQ(X) ((X) & (NTRU_Q-1)) | |||
typedef union { /* align to 32 byte boundary for vmovdqa */ | |||
uint16_t coeffs[PAD32(NTRU_N)]; | |||
__m256i coeffs_x16[PAD32(NTRU_N) / 16]; | |||
} poly; | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n(poly *r); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n(poly *r); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_tobytes(unsigned char *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_frombytes(poly *r, const unsigned char *a); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_tobytes(unsigned char *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_frombytes(poly *r, const unsigned char *a); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(unsigned char msg[NTRU_PACK_TRINARY_BYTES], const poly *a); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_frombytes(poly *r, const unsigned char msg[NTRU_PACK_TRINARY_BYTES]); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_mul(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_mul(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_lift(poly *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3(poly *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_inv(poly *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_inv(poly *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_inv(poly *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(poly *r); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_trinary_Zq_to_Z3(poly *r); | |||
#endif |
@@ -0,0 +1,11 @@ | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_lift(poly *r, const poly *a) { | |||
int i; | |||
for (i = 0; i < NTRU_N; i++) { | |||
r->coeffs[i] = a->coeffs[i]; | |||
} | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(r); | |||
} | |||
@@ -0,0 +1,676 @@ | |||
.data | |||
.p2align 5 | |||
mask_ff: | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
mask_f: | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
mask_3: | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n | |||
.global _PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n: | |||
_PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n: | |||
vmovdqa 992(%rdi), %ymm0 | |||
vpermq $3, %ymm0, %ymm0 | |||
vpslld $17, %ymm0, %ymm0 | |||
vpsrld $16, %ymm0, %ymm1 | |||
vpor %ymm0, %ymm1, %ymm0 | |||
vbroadcastss %xmm0, %ymm0 | |||
vpaddw 0(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 0(%rdi) | |||
vpaddw 32(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 32(%rdi) | |||
vpaddw 64(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 64(%rdi) | |||
vpaddw 96(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 96(%rdi) | |||
vpaddw 128(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 128(%rdi) | |||
vpaddw 160(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 160(%rdi) | |||
vpaddw 192(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 192(%rdi) | |||
vpaddw 224(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 224(%rdi) | |||
vpaddw 256(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 256(%rdi) | |||
vpaddw 288(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 288(%rdi) | |||
vpaddw 320(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 320(%rdi) | |||
vpaddw 352(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 352(%rdi) | |||
vpaddw 384(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 384(%rdi) | |||
vpaddw 416(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 416(%rdi) | |||
vpaddw 448(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 448(%rdi) | |||
vpaddw 480(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 480(%rdi) | |||
vpaddw 512(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 512(%rdi) | |||
vpaddw 544(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 544(%rdi) | |||
vpaddw 576(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 576(%rdi) | |||
vpaddw 608(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 608(%rdi) | |||
vpaddw 640(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 640(%rdi) | |||
vpaddw 672(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 672(%rdi) | |||
vpaddw 704(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 704(%rdi) | |||
vpaddw 736(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 736(%rdi) | |||
vpaddw 768(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 768(%rdi) | |||
vpaddw 800(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 800(%rdi) | |||
vpaddw 832(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 832(%rdi) | |||
vpaddw 864(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 864(%rdi) | |||
vpaddw 896(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 896(%rdi) | |||
vpaddw 928(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 928(%rdi) | |||
vpaddw 960(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 960(%rdi) | |||
vpaddw 992(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 992(%rdi) | |||
movw $0, 1018(%rdi) | |||
movw $0, 1020(%rdi) | |||
movw $0, 1022(%rdi) | |||
ret |
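/*
 * C reference (not upstream code) for the digit-folding reduction the
 * assembly above performs on every 16-bit lane. Since 2^8, 2^4 and 2^2 are
 * all congruent to 1 (mod 3), summing byte halves, then nibbles, then 2-bit
 * groups preserves the value mod 3 while shrinking it; the vpsubw/vpsraw
 * tail is a branchless form of the final conditional subtraction of 3.
 */
#include <assert.h>
#include <stdint.h>

static uint16_t mod3_fold(uint16_t v) {
    uint16_t t;
    t = (uint16_t) ((v & 0xff) + (v >> 8)); /* fold bytes:   <= 510 */
    t = (uint16_t) ((t & 0xf) + (t >> 4));  /* fold nibbles: <= 46  */
    t = (uint16_t) ((t & 0x3) + (t >> 2));  /* fold crumbs:  <= 14  */
    t = (uint16_t) ((t & 0x3) + (t >> 2));  /* once more:    <= 5   */
    return (uint16_t) (t >= 3 ? t - 3 : t); /* done branchlessly above */
}

static void mod3_fold_demo(void) {
    uint32_t v;
    for (v = 0; v <= 0xffff; v++) {
        assert(mod3_fold((uint16_t) v) == v % 3);
    }
}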
@@ -0,0 +1,80 @@ | |||
.data | |||
.p2align 5 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n | |||
.global _PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n: | |||
_PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n: | |||
vmovdqa 992(%rdi), %ymm0 | |||
vpermq $3, %ymm0, %ymm0 | |||
vpslld $16, %ymm0, %ymm0 | |||
vpsrld $16, %ymm0, %ymm1 | |||
vpor %ymm0, %ymm1, %ymm0 | |||
vbroadcastss %xmm0, %ymm0 | |||
vxorpd %ymm1, %ymm1, %ymm1 | |||
vpsubw %ymm0, %ymm1, %ymm0 | |||
vpaddw 0(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 0(%rdi) | |||
vpaddw 32(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 32(%rdi) | |||
vpaddw 64(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 64(%rdi) | |||
vpaddw 96(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 96(%rdi) | |||
vpaddw 128(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 128(%rdi) | |||
vpaddw 160(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 160(%rdi) | |||
vpaddw 192(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 192(%rdi) | |||
vpaddw 224(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 224(%rdi) | |||
vpaddw 256(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 256(%rdi) | |||
vpaddw 288(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 288(%rdi) | |||
vpaddw 320(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 320(%rdi) | |||
vpaddw 352(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 352(%rdi) | |||
vpaddw 384(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 384(%rdi) | |||
vpaddw 416(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 416(%rdi) | |||
vpaddw 448(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 448(%rdi) | |||
vpaddw 480(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 480(%rdi) | |||
vpaddw 512(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 512(%rdi) | |||
vpaddw 544(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 544(%rdi) | |||
vpaddw 576(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 576(%rdi) | |||
vpaddw 608(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 608(%rdi) | |||
vpaddw 640(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 640(%rdi) | |||
vpaddw 672(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 672(%rdi) | |||
vpaddw 704(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 704(%rdi) | |||
vpaddw 736(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 736(%rdi) | |||
vpaddw 768(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 768(%rdi) | |||
vpaddw 800(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 800(%rdi) | |||
vpaddw 832(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 832(%rdi) | |||
vpaddw 864(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 864(%rdi) | |||
vpaddw 896(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 896(%rdi) | |||
vpaddw 928(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 928(%rdi) | |||
vpaddw 960(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 960(%rdi) | |||
vpaddw 992(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 992(%rdi) | |||
ret |
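/*
 * C equivalent (not upstream code) of the assembly above. Reducing an
 * element of Z_q[x]/(x^n - 1) modulo Phi_n = 1 + x + ... + x^(n-1) just
 * subtracts the top coefficient from every coefficient, since
 * x^(n-1) == -(1 + x + ... + x^(n-2)) (mod Phi_n). The asm broadcasts
 * -coeffs[508] and adds it to all 32 vectors of the padded poly.
 */
#include "poly.h"

static void poly_mod_q_Phi_n_ref(poly *r) {
    int i;
    uint16_t top = r->coeffs[NTRU_N - 1];
    for (i = 0; i < NTRU_N; i++) {
        /* arithmetic mod 2^16; reduction to NTRU_LOGQ bits happens at packing */
        r->coeffs[i] = (uint16_t) (r->coeffs[i] - top);
    }
    /* coeffs[NTRU_N - 1] is now zero, as in the vectorized version */
}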
@@ -0,0 +1,80 @@ | |||
#include <immintrin.h> | |||
#include "poly_r2_inv.h" | |||
#include "poly.h" | |||
// Using pdep/pext for these two functions is faster, but not by much: they
// work on uint64_t, so each call only handles 4 coefficients. Per byte (where
// we store 8 coefficients) we thus need two pdeps/pexts plus an additional
// shift; in the case of tobytes we also need a logical or.
// On AMD Ryzen pdep/pext are quite slow, so there the naive solution (looping
// through and setting each bit individually) is preferred.
void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_tobytes(unsigned char *out, const poly *a) { | |||
    // Since pext works on a uint64_t, we view the coefficient pointer as a
    // 64-bit pointer so that we can extract 4 coefficients at a time. It also
    // makes the arithmetic a little easier.
uint64_t *coeff_pointer = (void *) a->coeffs; | |||
int i; | |||
for (i = 0; i < 63; i++) { | |||
out[i] = _pext_u64(coeff_pointer[2 * i], 0x1000100010001); | |||
out[i] |= _pext_u64(coeff_pointer[2 * i + 1], 0x1000100010001) << 4; | |||
} | |||
out[i] = _pext_u64(coeff_pointer[2 * 63], 0x1000100010001); | |||
out[i] |= _pext_u64(coeff_pointer[2 * 63 + 1], 0x1) << 4; | |||
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_frombytes(poly *a, const unsigned char *in) { | |||
    // Since pdep results in a uint64_t, we view the coefficient pointer as a
    // 64-bit pointer so that we can store 4 coefficients at a time. It also
    // makes the arithmetic a little easier.
uint64_t *coeff_pointer = (void *) a->coeffs; | |||
int i; | |||
for (i = 0; i < 63; i++) { | |||
coeff_pointer[2 * i] = _pdep_u64(in[i], 0x1000100010001); | |||
coeff_pointer[2 * i + 1] = _pdep_u64(in[i] >> 4, 0x1000100010001); | |||
} | |||
// From the last byte we only want 5 bits (since we have 509 total, not 512). | |||
coeff_pointer[2 * 63] = _pdep_u64(in[i], 0x1000100010001); | |||
coeff_pointer[2 * 63 + 1] = _pdep_u64(in[i] >> 4, 0x1); | |||
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_inv(poly *r, const poly *a) { | |||
union { | |||
unsigned char s[64]; | |||
__m256i s_x32[2]; | |||
} squares[13]; | |||
#define s(x) squares[(x)].s | |||
// This relies on the following addition chain: | |||
// 1, 2, 3, 6, 12, 15, 30, 60, 63, 126, 252, 504, 507 | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_tobytes(s(0), a); // TODO alignment | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_1_509(s(1), s(0)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(1), s(1), s(0)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_1_509(s(2), s(1)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(2), s(2), s(0)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_3_509(s(3), s(2)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(3), s(3), s(2)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_6_509(s(4), s(3)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(4), s(4), s(3)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_3_509(s(5), s(4)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(5), s(5), s(2)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_15_509(s(6), s(5)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(6), s(6), s(5)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_30_509(s(7), s(6)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(7), s(7), s(6)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_3_509(s(8), s(7)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(8), s(8), s(2)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_63_509(s(9), s(8)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(9), s(9), s(8)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_126_509(s(10), s(9)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(10), s(10), s(9)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_252_509(s(11), s(10)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(11), s(11), s(10)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_3_509(s(12), s(11)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(12), s(12), s(2)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_1_509(s(0), s(12)); | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_frombytes(r, s(0)); | |||
#undef s | |||
} |
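/*
 * Verification sketch (not upstream code) of the addition chain above.
 * Itoh-Tsujii style inversion tracks elements of the form a^(2^k - 1):
 * squaring j times and multiplying by s(m) = a^(2^k_m - 1) is valid when
 * j == k_m and yields k + k_m. Replaying the square/multiply schedule of
 * poly_R2_inv on the exponents must end at k = 507; the final single
 * squaring then gives a^(2^508 - 2) = a^(-1), since the squaring map has
 * order 508 on this ring (2 generates (Z/509)*).
 */
#include <assert.h>

static void r2_inv_chain_demo(void) {
    /* squaring counts and multiply partners, read off the calls above */
    int sq[12]  = {1, 1, 3, 6, 3, 15, 30, 3, 63, 126, 252, 3};
    int mul[12] = {0, 0, 2, 3, 2, 5,  6,  2, 8,  9,   10,  2};
    int k[13], i;
    k[0] = 1; /* s(0) = a = a^(2^1 - 1) */
    for (i = 0; i < 12; i++) {
        assert(sq[i] == k[mul[i]]); /* squaring count matches the partner */
        k[i + 1] = k[i] + k[mul[i]];
    }
    assert(k[12] == 507); /* one more squaring: a^(2*(2^507-1)) = a^(2^508-2) */
}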
@@ -0,0 +1,20 @@ | |||
#ifndef POLY_R2_INV_H | |||
#define POLY_R2_INV_H | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_tobytes(unsigned char *out, const poly *a); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_frombytes(poly *a, const unsigned char *in); | |||
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_1_509(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_3_509(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_6_509(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_15_509(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_30_509(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_63_509(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_126_509(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_252_509(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(unsigned char *out, const unsigned char *a, | |||
const unsigned char *b); | |||
#endif |
@@ -0,0 +1,285 @@ | |||
.data | |||
.p2align 5 | |||
mask1100: | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
mask0110: | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
mask0011: | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
mask1000: | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
mask0111: | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
low253: | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 8191 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul | |||
.global _PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul: | |||
_PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul: | |||
vmovdqa 0(%rsi), %ymm0 | |||
vmovdqa 32(%rsi), %ymm1 | |||
vmovdqa 0(%rdx), %ymm3 | |||
vmovdqa 32(%rdx), %ymm4 | |||
vpxor %ymm0, %ymm1, %ymm6 | |||
vpxor %ymm3, %ymm4, %ymm7 | |||
vextracti128 $1, %ymm0, %xmm11 | |||
vextracti128 $1, %ymm3, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm5 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm14 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm15 | |||
vpxor %xmm5, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm5 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm5, %ymm5 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm5, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm5 | |||
vpxor %xmm0, %xmm11, %xmm11 | |||
vpxor %xmm3, %xmm12, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm14 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm15 | |||
vpxor %xmm13, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm13 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm13, %ymm13 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm13, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm13 | |||
vpclmulqdq $1, %xmm0, %xmm3, %xmm2 | |||
vpclmulqdq $16, %xmm0, %xmm3, %xmm14 | |||
vpclmulqdq $17, %xmm0, %xmm3, %xmm15 | |||
vpxor %xmm2, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm0, %xmm3, %xmm2 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm2, %ymm2 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm2, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm2 | |||
vpxor %ymm13, %ymm5, %ymm13 | |||
vpxor %ymm13, %ymm2, %ymm13 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vextracti128 $1, %ymm13, %xmm11 | |||
vpxor %ymm5, %ymm11, %ymm5 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vinserti128 $1, %xmm13, %ymm11, %ymm11 | |||
vpxor %ymm11, %ymm2, %ymm2 | |||
vextracti128 $1, %ymm1, %xmm11 | |||
vextracti128 $1, %ymm4, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm9 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm14 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm15 | |||
vpxor %xmm9, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm9 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm9, %ymm9 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm9 | |||
vpxor %xmm1, %xmm11, %xmm11 | |||
vpxor %xmm4, %xmm12, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm14 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm15 | |||
vpxor %xmm13, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm13 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm13, %ymm13 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm13, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm13 | |||
vpclmulqdq $1, %xmm1, %xmm4, %xmm8 | |||
vpclmulqdq $16, %xmm1, %xmm4, %xmm14 | |||
vpclmulqdq $17, %xmm1, %xmm4, %xmm15 | |||
vpxor %xmm8, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm1, %xmm4, %xmm8 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm8, %ymm8 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm8, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm8 | |||
vpxor %ymm13, %ymm9, %ymm13 | |||
vpxor %ymm13, %ymm8, %ymm13 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vextracti128 $1, %ymm13, %xmm11 | |||
vpxor %ymm9, %ymm11, %ymm9 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vinserti128 $1, %xmm13, %ymm11, %ymm11 | |||
vpxor %ymm11, %ymm8, %ymm8 | |||
vextracti128 $1, %ymm6, %xmm11 | |||
vextracti128 $1, %ymm7, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm1 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm14 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm15 | |||
vpxor %xmm1, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm1 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm1, %ymm1 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm1, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm1 | |||
vpxor %xmm6, %xmm11, %xmm11 | |||
vpxor %xmm7, %xmm12, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm14 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm15 | |||
vpxor %xmm13, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm13 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm13, %ymm13 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm13, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm13 | |||
vpclmulqdq $1, %xmm6, %xmm7, %xmm0 | |||
vpclmulqdq $16, %xmm6, %xmm7, %xmm14 | |||
vpclmulqdq $17, %xmm6, %xmm7, %xmm15 | |||
vpxor %xmm0, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm6, %xmm7, %xmm0 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm0, %ymm0 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm0, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm0 | |||
vpxor %ymm13, %ymm1, %ymm13 | |||
vpxor %ymm13, %ymm0, %ymm13 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vextracti128 $1, %ymm13, %xmm11 | |||
vpxor %ymm1, %ymm11, %ymm1 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vinserti128 $1, %xmm13, %ymm11, %ymm11 | |||
vpxor %ymm11, %ymm0, %ymm0 | |||
vpxor %ymm0, %ymm2, %ymm0 | |||
vpxor %ymm0, %ymm8, %ymm0 | |||
vpxor %ymm1, %ymm5, %ymm1 | |||
vpxor %ymm1, %ymm9, %ymm1 | |||
vpxor %ymm0, %ymm5, %ymm5 | |||
vpxor %ymm1, %ymm8, %ymm8 | |||
vpand mask1000(%rip), %ymm5, %ymm13 | |||
vpand mask0111(%rip), %ymm8, %ymm12 | |||
vpxor %ymm12, %ymm13, %ymm12 | |||
vpsrlq $61, %ymm12, %ymm12 | |||
vpermq $147, %ymm12, %ymm12 | |||
vpxor %ymm12, %ymm2, %ymm2 | |||
vpsllq $3, %ymm8, %ymm12 | |||
vpxor %ymm12, %ymm2, %ymm2 | |||
vpand mask1000(%rip), %ymm8, %ymm13 | |||
vpand mask0111(%rip), %ymm9, %ymm12 | |||
vpxor %ymm12, %ymm13, %ymm12 | |||
vpsrlq $61, %ymm12, %ymm12 | |||
vpermq $147, %ymm12, %ymm12 | |||
vpxor %ymm12, %ymm5, %ymm5 | |||
vpsllq $3, %ymm9, %ymm12 | |||
vpxor %ymm12, %ymm5, %ymm5 | |||
vpand low253(%rip), %ymm5, %ymm5 | |||
vmovdqa %ymm2, 0(%rdi) | |||
vmovdqa %ymm5, 32(%rdi) | |||
ret |
@@ -0,0 +1,955 @@ | |||
.data | |||
.p2align 5 | |||
const_3_repeating: | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
.word 0x3 | |||
shuf_b8_to_low_doubleword: | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
.byte 8 | |||
.byte 255 | |||
mask_modq: | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
.word 2047 | |||
mask_ff: | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
mask_f: | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
mask_3: | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3 | |||
.global _PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3 | |||
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3: | |||
_PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3: | |||
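# Added note (reconstructed from the code below): each block of 16 lanes
# reduces one vector of Rq coefficients to S3.  A coefficient is masked to
# 11 bits (q = 2048), a bias is added so the centered representative
# becomes nonnegative without changing its value mod 3, and a correction
# derived from the top coefficient (broadcast in ymm4) is mixed in.  The
# sum is then folded digit by digit -- 8-, 4- and 2-bit chunks, whose
# weights 256, 16 and 4 are all congruent to 1 mod 3 -- and a final
# compare-and-select maps the value 3 back to 0.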
vmovdqa const_3_repeating(%rip), %ymm3 | |||
vmovdqa mask_modq(%rip), %ymm6 | |||
vmovdqa 992(%rsi), %ymm4 | |||
vpand %ymm6, %ymm4, %ymm4 | |||
vpsrlw $10, %ymm4, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm4 | |||
vpsrlw $8, %ymm4, %ymm5 | |||
vpand mask_ff(%rip), %ymm4, %ymm4 | |||
vpaddw %ymm5, %ymm4, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm4 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm4, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm4 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm4, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm4 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm4, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm4 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm4, %ymm5 | |||
vpsllw $1, %ymm5, %ymm4 | |||
vextracti128 $1, %ymm4, %xmm4 | |||
vpshufb shuf_b8_to_low_doubleword(%rip), %ymm4, %ymm4 | |||
vinserti128 $1, %xmm4, %ymm4, %ymm4 | |||
vmovdqa 0(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 0(%rdi) | |||
vmovdqa 32(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 32(%rdi) | |||
vmovdqa 64(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 64(%rdi) | |||
vmovdqa 96(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 96(%rdi) | |||
vmovdqa 128(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 128(%rdi) | |||
vmovdqa 160(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 160(%rdi) | |||
vmovdqa 192(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 192(%rdi) | |||
vmovdqa 224(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 224(%rdi) | |||
vmovdqa 256(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 256(%rdi) | |||
vmovdqa 288(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 288(%rdi) | |||
vmovdqa 320(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 320(%rdi) | |||
vmovdqa 352(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 352(%rdi) | |||
vmovdqa 384(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 384(%rdi) | |||
vmovdqa 416(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 416(%rdi) | |||
vmovdqa 448(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 448(%rdi) | |||
vmovdqa 480(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 480(%rdi) | |||
vmovdqa 512(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 512(%rdi) | |||
vmovdqa 544(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 544(%rdi) | |||
vmovdqa 576(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 576(%rdi) | |||
vmovdqa 608(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 608(%rdi) | |||
vmovdqa 640(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 640(%rdi) | |||
vmovdqa 672(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 672(%rdi) | |||
vmovdqa 704(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 704(%rdi) | |||
vmovdqa 736(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 736(%rdi) | |||
vmovdqa 768(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 768(%rdi) | |||
vmovdqa 800(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 800(%rdi) | |||
vmovdqa 832(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 832(%rdi) | |||
vmovdqa 864(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 864(%rdi) | |||
vmovdqa 896(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 896(%rdi) | |||
vmovdqa 928(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 928(%rdi) | |||
vmovdqa 960(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 960(%rdi) | |||
vmovdqa 992(%rsi), %ymm1 | |||
vpand %ymm6, %ymm1, %ymm1 | |||
vpsrlw $10, %ymm1, %ymm0 | |||
vpxor %ymm3, %ymm0, %ymm0 | |||
vpsllw $11, %ymm0, %ymm0 | |||
vpaddw %ymm1, %ymm0, %ymm0 | |||
vpaddw %ymm4, %ymm0, %ymm0 | |||
vpsrlw $8, %ymm0, %ymm5 | |||
vpand mask_ff(%rip), %ymm0, %ymm0 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_f(%rip), %ymm5, %ymm0 | |||
vpsrlw $4, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpand mask_3(%rip), %ymm5, %ymm0 | |||
vpsrlw $2, %ymm5, %ymm5 | |||
vpaddw %ymm5, %ymm0, %ymm5 | |||
vpsubw mask_3(%rip), %ymm5, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm0 | |||
vpand %ymm15, %ymm5, %ymm14 | |||
vpxor %ymm14, %ymm0, %ymm5 | |||
vmovdqa %ymm5, 992(%rdi) | |||
ret |
@@ -0,0 +1,463 @@ | |||
#include "poly.h" | |||
#include <immintrin.h> | |||
typedef signed char small; | |||
#define p 508 | |||
#define ppad 512 | |||
#define numvec 2 | |||
typedef __m256i vec256; | |||
/* | |||
This code stores 512-coeff poly as vec256[2]. | |||
Order of 256 coefficients in each vec256 | |||
is optimized in light of costs of vector instructions: | |||
0,4,...,252 in 64-bit word; | |||
1,5,...,253 in 64-bit word; | |||
2,6,...,254 in 64-bit word; | |||
3,7,...,255 in 64-bit word. | |||
*/ | |||
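/*
Added note: equivalently, coefficient i of a 256-coefficient half sits at
bit i/4 of 64-bit word i%4.  The shuffle network at the end of
vec256_frombits (and its mirror in vec256_tobits) converts between plain
byte order and this interleaved order.
*/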
static inline void vec256_frombits(vec256 *v, const small *b) { | |||
int i; | |||
for (i = 0; i < numvec; ++i) { | |||
vec256 b0 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; /* 0,1,...,31 */ | |||
vec256 b1 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; /* 32,33,... */ | |||
vec256 b2 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 b3 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 b4 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 b5 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 b6 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 b7 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 c0 = _mm256_unpacklo_epi32(b0, b1); /* 0 1 2 3 32 33 34 35 4 5 6 7 36 37 38 39 ... 55 */ | |||
vec256 c1 = _mm256_unpackhi_epi32(b0, b1); /* 8 9 10 11 40 41 42 43 ... 63 */ | |||
vec256 c2 = _mm256_unpacklo_epi32(b2, b3); | |||
vec256 c3 = _mm256_unpackhi_epi32(b2, b3); | |||
vec256 c4 = _mm256_unpacklo_epi32(b4, b5); | |||
vec256 c5 = _mm256_unpackhi_epi32(b4, b5); | |||
vec256 c6 = _mm256_unpacklo_epi32(b6, b7); | |||
vec256 c7 = _mm256_unpackhi_epi32(b6, b7); | |||
vec256 d0 = c0 | _mm256_slli_epi32(c1, 2); /* 0 8, 1 9, 2 10, 3 11, 32 40, 33 41, ..., 55 63 */ | |||
vec256 d2 = c2 | _mm256_slli_epi32(c3, 2); | |||
vec256 d4 = c4 | _mm256_slli_epi32(c5, 2); | |||
vec256 d6 = c6 | _mm256_slli_epi32(c7, 2); | |||
vec256 e0 = _mm256_unpacklo_epi64(d0, d2); | |||
vec256 e2 = _mm256_unpackhi_epi64(d0, d2); | |||
vec256 e4 = _mm256_unpacklo_epi64(d4, d6); | |||
vec256 e6 = _mm256_unpackhi_epi64(d4, d6); | |||
vec256 f0 = e0 | _mm256_slli_epi32(e2, 1); | |||
vec256 f4 = e4 | _mm256_slli_epi32(e6, 1); | |||
vec256 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); | |||
vec256 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); | |||
vec256 h = g0 | _mm256_slli_epi32(g4, 4); | |||
#define TRANSPOSE _mm256_set_epi8( 31,27,23,19, 30,26,22,18, 29,25,21,17, 28,24,20,16, 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0 ) | |||
h = _mm256_shuffle_epi8(h, TRANSPOSE); | |||
h = _mm256_permute4x64_epi64(h, 0xd8); | |||
h = _mm256_shuffle_epi32(h, 0xd8); | |||
*v++ = h; | |||
} | |||
} | |||
static inline void vec256_tobits(const vec256 *v, small *b) { | |||
int i; | |||
for (i = 0; i < numvec; ++i) { | |||
vec256 h = *v++; | |||
h = _mm256_shuffle_epi32(h, 0xd8); | |||
h = _mm256_permute4x64_epi64(h, 0xd8); | |||
h = _mm256_shuffle_epi8(h, TRANSPOSE); | |||
vec256 g0 = h & _mm256_set1_epi8(15); | |||
vec256 g4 = _mm256_srli_epi32(h, 4) & _mm256_set1_epi8(15); | |||
vec256 f0 = _mm256_permute2x128_si256(g0, g4, 0x20); | |||
vec256 f4 = _mm256_permute2x128_si256(g0, g4, 0x31); | |||
vec256 e0 = f0 & _mm256_set1_epi8(5); | |||
vec256 e2 = _mm256_srli_epi32(f0, 1) & _mm256_set1_epi8(5); | |||
vec256 e4 = f4 & _mm256_set1_epi8(5); | |||
vec256 e6 = _mm256_srli_epi32(f4, 1) & _mm256_set1_epi8(5); | |||
vec256 d0 = _mm256_unpacklo_epi32(e0, e2); | |||
vec256 d2 = _mm256_unpackhi_epi32(e0, e2); | |||
vec256 d4 = _mm256_unpacklo_epi32(e4, e6); | |||
vec256 d6 = _mm256_unpackhi_epi32(e4, e6); | |||
vec256 c0 = d0 & _mm256_set1_epi8(1); | |||
vec256 c1 = _mm256_srli_epi32(d0, 2) & _mm256_set1_epi8(1); | |||
vec256 c2 = d2 & _mm256_set1_epi8(1); | |||
vec256 c3 = _mm256_srli_epi32(d2, 2) & _mm256_set1_epi8(1); | |||
vec256 c4 = d4 & _mm256_set1_epi8(1); | |||
vec256 c5 = _mm256_srli_epi32(d4, 2) & _mm256_set1_epi8(1); | |||
vec256 c6 = d6 & _mm256_set1_epi8(1); | |||
vec256 c7 = _mm256_srli_epi32(d6, 2) & _mm256_set1_epi8(1); | |||
vec256 b0 = _mm256_unpacklo_epi64(c0, c1); | |||
vec256 b1 = _mm256_unpackhi_epi64(c0, c1); | |||
vec256 b2 = _mm256_unpacklo_epi64(c2, c3); | |||
vec256 b3 = _mm256_unpackhi_epi64(c2, c3); | |||
vec256 b4 = _mm256_unpacklo_epi64(c4, c5); | |||
vec256 b5 = _mm256_unpackhi_epi64(c4, c5); | |||
vec256 b6 = _mm256_unpacklo_epi64(c6, c7); | |||
vec256 b7 = _mm256_unpackhi_epi64(c6, c7); | |||
_mm256_storeu_si256((vec256 *) b, b0); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b1); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b2); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b3); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b4); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b5); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b6); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b7); | |||
b += 32; | |||
} | |||
} | |||
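/*
Added note: vec256_init/vec256_final bit-slice each ternary coefficient
c in {0, 1, -1} into two bit planes: g0 = "is nonzero" and g1 = "is -1"
(set only when g0 is set), so 0 -> (0,0), 1 -> (1,0), -1 -> (1,1).
vec256_final inverts the encoding via v = v0 + 2*v1 - 4*(v0 & v1).
*/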
static void vec256_init(vec256 *G0, vec256 *G1, const small *s) { | |||
int i; | |||
small srev[ppad + (ppad - p)]; | |||
small si; | |||
small g0[ppad]; | |||
small g1[ppad]; | |||
for (i = 0; i < p; ++i) { | |||
srev[ppad - 1 - i] = s[i]; | |||
} | |||
for (i = 0; i < ppad - p; ++i) { | |||
srev[i] = 0; | |||
} | |||
for (i = p; i < ppad; ++i) { | |||
srev[i + ppad - p] = 0; | |||
} | |||
for (i = 0; i < ppad; ++i) { | |||
si = srev[i + ppad - p]; | |||
g0[i] = si & 1; | |||
g1[i] = (si >> 1) & g0[i]; | |||
} | |||
vec256_frombits(G0, g0); | |||
vec256_frombits(G1, g1); | |||
} | |||
static void vec256_final(small *out, const vec256 *V0, const vec256 *V1) { | |||
int i; | |||
small v0[ppad]; | |||
small v1[ppad]; | |||
small v[ppad]; | |||
small vrev[ppad + (ppad - p)]; | |||
vec256_tobits(V0, v0); | |||
vec256_tobits(V1, v1); | |||
for (i = 0; i < ppad; ++i) { | |||
v[i] = v0[i] + 2 * v1[i] - 4 * (v0[i] & v1[i]); | |||
} | |||
for (i = 0; i < ppad; ++i) { | |||
vrev[i] = v[ppad - 1 - i]; | |||
} | |||
for (i = ppad; i < ppad + (ppad - p); ++i) { | |||
vrev[i] = 0; | |||
} | |||
for (i = 0; i < p; ++i) { | |||
out[i] = vrev[i + ppad - p]; | |||
} | |||
} | |||
static inline int negative_mask(int x) { | |||
return x >> 31; | |||
} | |||
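/* Added note: returns all-ones if x is negative, else 0.  Like the
   (x - 3) >> 5 trick in the wrapper below, this relies on arithmetic
   right shift of signed values, which is implementation-defined in ISO C
   but provided by the compilers this code is built with. */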
static inline void vec256_swap(vec256 *f, vec256 *g, int len, vec256 mask) { | |||
vec256 flip; | |||
int i; | |||
for (i = 0; i < len; ++i) { | |||
flip = mask & (f[i] ^ g[i]); | |||
f[i] ^= flip; | |||
g[i] ^= flip; | |||
} | |||
} | |||
static inline void vec256_scale(vec256 *f0, vec256 *f1, const vec256 c0, const vec256 c1) { | |||
int i; | |||
for (i = 0; i < numvec; ++i) { | |||
vec256 f0i = f0[i]; | |||
vec256 f1i = f1[i]; | |||
f0i &= c0; | |||
f1i ^= c1; | |||
f1i &= f0i; | |||
f0[i] = f0i; | |||
f1[i] = f1i; | |||
} | |||
} | |||
static inline void vec256_eliminate(vec256 *f0, vec256 *f1, vec256 *g0, vec256 *g1, int len, const vec256 c0, const vec256 c1) { | |||
int i; | |||
for (i = 0; i < len; ++i) { | |||
vec256 f0i = f0[i]; | |||
vec256 f1i = f1[i]; | |||
vec256 g0i = g0[i]; | |||
vec256 g1i = g1[i]; | |||
vec256 t; | |||
f0i &= c0; | |||
f1i ^= c1; | |||
f1i &= f0i; | |||
t = g0i ^ f0i; | |||
g0[i] = t | (g1i ^ f1i); | |||
g1[i] = (g1i ^ f0i) & (f1i ^ t); | |||
} | |||
} | |||
static inline int vec256_bit0mask(vec256 *f) { | |||
return -(_mm_cvtsi128_si32(_mm256_castsi256_si128(f[0])) & 1); | |||
} | |||
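/* Added note: broadcasts bit 0 of f to a full-width mask: -1 if set, 0 if clear. */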
static inline void vec256_divx_1(vec256 *f) { | |||
vec256 f0 = f[0]; | |||
unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); | |||
low0 = low0 >> 1; | |||
f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); | |||
f[0] = _mm256_permute4x64_epi64(f0, 0x39); | |||
} | |||
static inline void vec256_divx_2(vec256 *f) { | |||
vec256 f0 = f[0]; | |||
vec256 f1 = f[1]; | |||
unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); | |||
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); | |||
low0 = (low0 >> 1) | (low1 << 63); | |||
low1 = low1 >> 1; | |||
f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); | |||
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); | |||
f[0] = _mm256_permute4x64_epi64(f0, 0x39); | |||
f[1] = _mm256_permute4x64_epi64(f1, 0x39); | |||
} | |||
static inline void vec256_timesx_1(vec256 *f) { | |||
vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); | |||
unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); | |||
low0 = low0 << 1; | |||
f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); | |||
f[0] = f0; | |||
} | |||
static inline void vec256_timesx_2(vec256 *f) { | |||
vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); | |||
vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); | |||
unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); | |||
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); | |||
low1 = (low1 << 1) | (low0 >> 63); | |||
low0 = low0 << 1; | |||
f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); | |||
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); | |||
f[0] = f0; | |||
f[1] = f1; | |||
} | |||
static int __poly_S3_inv(unsigned char *outbytes, const unsigned char *inbytes) { | |||
small *out = (void *) outbytes; | |||
small *in = (void *) inbytes; | |||
vec256 F0[numvec]; | |||
vec256 F1[numvec]; | |||
vec256 G0[numvec]; | |||
vec256 G1[numvec]; | |||
vec256 V0[numvec]; | |||
vec256 V1[numvec]; | |||
vec256 R0[numvec]; | |||
vec256 R1[numvec]; | |||
vec256 c0vec, c1vec; | |||
int loop; | |||
int c0, c1; | |||
int minusdelta = -1; | |||
int swapmask; | |||
vec256 swapvec; | |||
vec256_init(G0, G1, in); | |||
F0[0] = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1); | |||
F0[1] = _mm256_set_epi32(2147483647, -1, 2147483647, -1, 2147483647, -1, -1, -1); | |||
F1[0] = _mm256_set1_epi32(0); | |||
F1[1] = _mm256_set1_epi32(0); | |||
V0[0] = _mm256_set1_epi32(0); | |||
V1[0] = _mm256_set1_epi32(0); | |||
V0[1] = _mm256_set1_epi32(0); | |||
V1[1] = _mm256_set1_epi32(0); | |||
R0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1); | |||
R1[0] = _mm256_set1_epi32(0); | |||
R0[1] = _mm256_set1_epi32(0); | |||
R1[1] = _mm256_set1_epi32(0); | |||
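    /*
     * Added note: the three loops below execute 256 + 503 + 256 = 1015 =
     * 2*508 - 1 constant-time division steps, the iteration bound from the
     * Bernstein--Yang paper for degree-508 inputs.  The split only saves
     * vector work: early on, V and R still fit in one 256-bit vector
     * (timesx_1 / length-1 swaps), and at the end G has shrunk to one
     * vector (divx_1 / length-1 swaps).
     */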
for (loop = 256; loop > 0; --loop) { | |||
vec256_timesx_1(V0); | |||
vec256_timesx_1(V1); | |||
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); | |||
c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); | |||
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); | |||
c1 &= c0; | |||
minusdelta ^= swapmask & (minusdelta ^ -minusdelta); | |||
minusdelta -= 1; | |||
swapvec = _mm256_set1_epi32(swapmask); | |||
vec256_swap(F0, G0, 2, swapvec); | |||
vec256_swap(F1, G1, 2, swapvec); | |||
c0vec = _mm256_set1_epi32(c0); | |||
c1vec = _mm256_set1_epi32(c1); | |||
vec256_eliminate(F0, F1, G0, G1, 2, c0vec, c1vec); | |||
vec256_divx_2(G0); | |||
vec256_divx_2(G1); | |||
vec256_swap(V0, R0, 1, swapvec); | |||
vec256_swap(V1, R1, 1, swapvec); | |||
vec256_eliminate(V0, V1, R0, R1, 1, c0vec, c1vec); | |||
} | |||
for (loop = 503; loop > 0; --loop) { | |||
vec256_timesx_2(V0); | |||
vec256_timesx_2(V1); | |||
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); | |||
c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); | |||
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); | |||
c1 &= c0; | |||
minusdelta ^= swapmask & (minusdelta ^ -minusdelta); | |||
minusdelta -= 1; | |||
swapvec = _mm256_set1_epi32(swapmask); | |||
vec256_swap(F0, G0, 2, swapvec); | |||
vec256_swap(F1, G1, 2, swapvec); | |||
c0vec = _mm256_set1_epi32(c0); | |||
c1vec = _mm256_set1_epi32(c1); | |||
vec256_eliminate(F0, F1, G0, G1, 2, c0vec, c1vec); | |||
vec256_divx_2(G0); | |||
vec256_divx_2(G1); | |||
vec256_swap(V0, R0, 2, swapvec); | |||
vec256_swap(V1, R1, 2, swapvec); | |||
vec256_eliminate(V0, V1, R0, R1, 2, c0vec, c1vec); | |||
} | |||
for (loop = 256; loop > 0; --loop) { | |||
vec256_timesx_2(V0); | |||
vec256_timesx_2(V1); | |||
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); | |||
c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); | |||
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); | |||
c1 &= c0; | |||
minusdelta ^= swapmask & (minusdelta ^ -minusdelta); | |||
minusdelta -= 1; | |||
swapvec = _mm256_set1_epi32(swapmask); | |||
vec256_swap(F0, G0, 1, swapvec); | |||
vec256_swap(F1, G1, 1, swapvec); | |||
c0vec = _mm256_set1_epi32(c0); | |||
c1vec = _mm256_set1_epi32(c1); | |||
vec256_eliminate(F0, F1, G0, G1, 1, c0vec, c1vec); | |||
vec256_divx_1(G0); | |||
vec256_divx_1(G1); | |||
vec256_swap(V0, R0, 2, swapvec); | |||
vec256_swap(V1, R1, 2, swapvec); | |||
vec256_eliminate(V0, V1, R0, R1, 2, c0vec, c1vec); | |||
} | |||
c0vec = _mm256_set1_epi32(vec256_bit0mask(F0)); | |||
c1vec = _mm256_set1_epi32(vec256_bit0mask(F1)); | |||
vec256_scale(V0, V1, c0vec, c1vec); | |||
vec256_final(out, V0, V1); | |||
out[p] = negative_mask(minusdelta); | |||
return 0; | |||
} | |||
// This code is based on crypto_core/invhrss701/faster from SUPERCOP. The code was written as a case study | |||
// for the paper "Fast constant-time gcd computation and modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. | |||
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_inv(poly *r_out, const poly *a) { | |||
const unsigned char *in = (void *) a; | |||
unsigned char *out = (void *) r_out; | |||
small input[ppad]; | |||
small output[ppad]; | |||
int i; | |||
/* XXX: obviously input/output format should be packed into bytes */ | |||
for (i = 0; i < p; ++i) { | |||
small x = in[2 * i] & 3; /* 0 1 2 3 */
x += 1; /* 1 2 3 4 */
x &= (x - 3) >> 5; /* 1 2 0 0, i.e. an input of 2 (= -1 mod 3) maps to 0 */
input[i] = x - 1; /* 0 1 -1 */
} | |||
/* XXX: merge with vec256_init */ | |||
__poly_S3_inv((unsigned char *)output, (unsigned char *)input); | |||
for (i = 0; i < p; ++i) { | |||
out[2 * i] = (3 & output[i]) ^ ((3 & output[i]) >> 1); | |||
out[2 * i + 1] = 0; | |||
} | |||
} |
@@ -0,0 +1,46 @@ | |||
#include "sample.h" | |||
void PQCLEAN_NTRUHPS2048509_AVX2_sample_fg(poly *f, poly *g, const unsigned char uniformbytes[NTRU_SAMPLE_FG_BYTES]) { | |||
PQCLEAN_NTRUHPS2048509_AVX2_sample_iid(f, uniformbytes); | |||
PQCLEAN_NTRUHPS2048509_AVX2_sample_fixed_type(g, uniformbytes + NTRU_SAMPLE_IID_BYTES); | |||
} | |||
void PQCLEAN_NTRUHPS2048509_AVX2_sample_rm(poly *r, poly *m, const unsigned char uniformbytes[NTRU_SAMPLE_RM_BYTES]) { | |||
PQCLEAN_NTRUHPS2048509_AVX2_sample_iid(r, uniformbytes); | |||
PQCLEAN_NTRUHPS2048509_AVX2_sample_fixed_type(m, uniformbytes + NTRU_SAMPLE_IID_BYTES); | |||
} | |||
#include "crypto_sort_int32.h" | |||
void PQCLEAN_NTRUHPS2048509_AVX2_sample_fixed_type(poly *r, const unsigned char u[NTRU_SAMPLE_FT_BYTES]) { | |||
// Assumes NTRU_SAMPLE_FT_BYTES = ceil(30*(n-1)/8) | |||
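// Added note: for NTRU_N = 509 that is ceil(30*508/8) = 1905 bytes,
// matching the 15 input bytes consumed per 4 output words in the loop below.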
int32_t s[NTRU_N - 1]; | |||
int i; | |||
// Use 30 bits of u per word | |||
for (i = 0; i < (NTRU_N - 1) / 4; i++) { | |||
s[4 * i + 0] = (u[15 * i + 0] << 2) + (u[15 * i + 1] << 10) + (u[15 * i + 2] << 18) + ((uint32_t) u[15 * i + 3] << 26); | |||
s[4 * i + 1] = ((u[15 * i + 3] & 0xc0) >> 4) + (u[15 * i + 4] << 4) + (u[15 * i + 5] << 12) + (u[15 * i + 6] << 20) + ((uint32_t) u[15 * i + 7] << 28); | |||
s[4 * i + 2] = ((u[15 * i + 7] & 0xf0) >> 2) + (u[15 * i + 8] << 6) + (u[15 * i + 9] << 14) + (u[15 * i + 10] << 22) + ((uint32_t) u[15 * i + 11] << 30); | |||
s[4 * i + 3] = (u[15 * i + 11] & 0xfc) + (u[15 * i + 12] << 8) + (u[15 * i + 13] << 16) + ((uint32_t) u[15 * i + 14] << 24);
} | |||
for (i = 0; i < NTRU_WEIGHT / 2; i++) { | |||
s[i] |= 1; | |||
} | |||
for (i = NTRU_WEIGHT / 2; i < NTRU_WEIGHT; i++) { | |||
s[i] |= 2; | |||
} | |||
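// Added note: at this point every word holds 30 uniformly random bits
// (bits 2..31) above a 2-bit coefficient (bits 0..1): 1 for the first
// NTRU_WEIGHT/2 words, 2 for the next NTRU_WEIGHT/2, and 0 elsewhere.
// Sorting the words -- effectively by their random keys -- applies a
// uniform random permutation to the coefficients in constant time,
// yielding a fixed-weight ternary polynomial.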
PQCLEAN_NTRUHPS2048509_AVX2_crypto_sort_int32(s, NTRU_N - 1); | |||
for (i = 0; i < NTRU_N - 1; i++) { | |||
r->coeffs[i] = ((uint16_t) (s[i] & 3)); | |||
} | |||
r->coeffs[NTRU_N - 1] = 0; | |||
} |
@@ -0,0 +1,15 @@ | |||
#ifndef SAMPLE_H | |||
#define SAMPLE_H | |||
#include "params.h" | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048509_AVX2_sample_fg(poly *f, poly *g, const unsigned char uniformbytes[NTRU_SAMPLE_FG_BYTES]); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_sample_rm(poly *r, poly *m, const unsigned char uniformbytes[NTRU_SAMPLE_RM_BYTES]); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_sample_iid(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_IID_BYTES]); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_sample_fixed_type(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_FT_BYTES]); | |||
#endif |
@@ -0,0 +1,21 @@ | |||
#include <immintrin.h> | |||
#include "sample.h" | |||
extern void PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid(poly *r, const unsigned char uniformbytes[PAD32(NTRU_SAMPLE_IID_BYTES)]); | |||
void PQCLEAN_NTRUHPS2048509_AVX2_sample_iid(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_IID_BYTES]) { | |||
int i; | |||
union { /* align to 32 byte boundary for vmovdqa */ | |||
unsigned char b[PAD32(NTRU_SAMPLE_IID_BYTES)]; | |||
__m256i b_x32[PAD32(NTRU_SAMPLE_IID_BYTES) / 32]; | |||
} buffer; | |||
for (i = 0; i < NTRU_SAMPLE_IID_BYTES; i++) { | |||
buffer.b[i] = uniformbytes[i]; | |||
} | |||
for (i = NTRU_SAMPLE_IID_BYTES; i < PAD32(NTRU_SAMPLE_IID_BYTES); i++) { | |||
buffer.b[i] = 0; | |||
} | |||
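    /* Added note: the copy guarantees the 32-byte alignment that the
       vmovdqa loads in the assembly routine expect, and the zero padding
       lets it read whole 32-byte blocks past NTRU_SAMPLE_IID_BYTES. */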
PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid(r, buffer.b); | |||
} |
@@ -0,0 +1,109 @@ | |||
.data | |||
.p2align 5 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048509_AVX2_square_1_509 | |||
.global _PQCLEAN_NTRUHPS2048509_AVX2_square_1_509 | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_1_509: | |||
_PQCLEAN_NTRUHPS2048509_AVX2_square_1_509: | |||
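# Added note (reconstructed from the code): squaring in GF(2)[x] simply
# moves the bit at index i to index 2*i, which the pdep/pext pairs below
# perform 32 source bits at a time (depositing into even positions via the
# 0x5555... masks).  Indices with 2*i >= 509 wrap around to 2*i - 509, an
# odd position -- hence the 0xaaaa... deposit masks and shifted extract
# masks in the second half of the routine.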
push %r15 | |||
push %r14 | |||
push %r13 | |||
push %r12 | |||
push %rbx | |||
push %rbp | |||
mov 0(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $-0x1, %r10 | |||
mov $0x5555555555555555, %rbp | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 0(%rdi) | |||
mov $0xffffffff00000000, %rbx | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 8(%rdi) | |||
mov 8(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $-0x1, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 16(%rdi) | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 24(%rdi) | |||
mov 16(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $-0x1, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 32(%rdi) | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 40(%rdi) | |||
mov 24(%rsi), %r11 | |||
mov %r11, %r10 | |||
rol $2, %r10 | |||
and $0x2, %r10 | |||
xor %r10, 0(%rdi) | |||
mov %r11, %r10 | |||
and $-0x1, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 48(%rdi) | |||
mov $0x7fffffff00000000, %r12 | |||
pext %r12, %r11, %r10 | |||
mov $0x1555555555555555, %r13 | |||
pdep %r13, %r10, %r10 | |||
mov %r10, 56(%rdi) | |||
mov 32(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x7fffffff, %r10 | |||
mov $0xaaaaaaaaaaaaaaa8, %r14 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x7fffffff80000000, %r15 | |||
pext %r15, %r11, %r10 | |||
mov $0xaaaaaaaaaaaaaaaa, %r9 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov %r11, %r10 | |||
rol $2, %r10 | |||
and $0x2, %r10 | |||
xor %r10, 16(%rdi) | |||
mov 40(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x7fffffff, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
pext %r15, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov %r11, %r10 | |||
rol $2, %r10 | |||
and $0x2, %r10 | |||
xor %r10, 32(%rdi) | |||
mov 48(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x7fffffff, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %r15, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov %r11, %r10 | |||
rol $2, %r10 | |||
and $0x2, %r10 | |||
xor %r10, 48(%rdi) | |||
mov 56(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x7fffffff, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x1fffffff80000000, %r8 | |||
pext %r8, %r11, %r10 | |||
mov $0xaaaaaaaaaaaaaaa, %rdx | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
pop %rbp | |||
pop %rbx | |||
pop %r12 | |||
pop %r13 | |||
pop %r14 | |||
pop %r15 | |||
ret |
@@ -0,0 +1,272 @@ | |||
.data | |||
.p2align 5 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048509_AVX2_square_3_509 | |||
.global _PQCLEAN_NTRUHPS2048509_AVX2_square_3_509 | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_3_509: | |||
_PQCLEAN_NTRUHPS2048509_AVX2_square_3_509: | |||
push %r15 | |||
push %r14 | |||
push %r13 | |||
push %r12 | |||
push %rbx | |||
push %rbp | |||
mov 0(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0xff, %r10 | |||
mov $0x101010101010101, %rbp | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 0(%rdi) | |||
mov $0xff00, %rbx | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 8(%rdi) | |||
mov $0xff0000, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 16(%rdi) | |||
mov $0xff000000, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 24(%rdi) | |||
mov $0xff00000000, %r14 | |||
pext %r14, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 32(%rdi) | |||
mov $0xff0000000000, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 40(%rdi) | |||
mov $0xff000000000000, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 48(%rdi) | |||
mov $0xff00000000000000, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 56(%rdi) | |||
mov 8(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0xff, %r10 | |||
mov $0x808080808080808, %rdx | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
pext %rbx, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
pext %r13, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %r15, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
pext %r8, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 16(%rsi), %r11 | |||
mov $0x80000000000000ff, %rcx | |||
pext %rcx, %r11, %r10 | |||
mov $0x9010101010101010, %rax | |||
pdep %rax, %r10, %r10 | |||
rol $2, %r10 | |||
xor %r10, 0(%rdi) | |||
pext %rbx, %r11, %r10 | |||
mov $0x4040404040404040, %rbp | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
pext %r13, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %r15, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x7f00000000000000, %r8 | |||
pext %r8, %r11, %r10 | |||
mov $0x40404040404040, %rdx | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 24(%rsi), %r11 | |||
mov $0x800000000000007f, %rcx | |||
pext %rcx, %r11, %r10 | |||
mov $0x8010101010101010, %rax | |||
pdep %rax, %r10, %r10 | |||
rol $5, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x7f80, %rbx | |||
pext %rbx, %r11, %r10 | |||
mov $0x202020202020202, %r12 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0x7f8000, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x7f800000, %r14 | |||
pext %r14, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0x7f80000000, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0x7f8000000000, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0x7f800000000000, %rbp | |||
pext %rbp, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x7f80000000000000, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 32(%rsi), %r11 | |||
pext %rcx, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
rol $8, %r10 | |||
xor %r10, 0(%rdi) | |||
pext %rbx, %r11, %r10 | |||
mov $0x1010101010101010, %rdx | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
pext %r13, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
pext %r15, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %rbp, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
pext %r8, %r11, %r10 | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 40(%rsi), %r11 | |||
mov $0xc00000000000007f, %r12 | |||
pext %r12, %r11, %r10 | |||
mov $0x8090101010101010, %rcx | |||
pdep %rcx, %r10, %r10 | |||
rol $11, %r10 | |||
xor %r10, 0(%rdi) | |||
pext %rbx, %r11, %r10 | |||
mov $0x8080808080808080, %rax | |||
pdep %rax, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
pext %r13, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
pext %r15, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %rbp, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x3f80000000000000, %r8 | |||
pext %r8, %r11, %r10 | |||
mov $0x80808080808080, %rdx | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 48(%rsi), %r11 | |||
mov $0xc00000000000003f, %r12 | |||
pext %r12, %r11, %r10 | |||
mov $0x8080101010101010, %rcx | |||
pdep %rcx, %r10, %r10 | |||
rol $14, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x3fc0, %rbx | |||
pext %rbx, %r11, %r10 | |||
mov $0x404040404040404, %r13 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0x3fc000, %r14 | |||
pext %r14, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x3fc00000, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0x3fc0000000, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0x3fc000000000, %rbp | |||
pext %rbp, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0x3fc00000000000, %rax | |||
pext %rax, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x3fc0000000000000, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 56(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x3f, %r10 | |||
mov $0x2020202020200000, %rdx | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
pext %rbx, %r11, %r10 | |||
mov $0x2020202020202020, %r12 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
pext %r15, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %rbp, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %rax, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x1fc0000000000000, %rcx | |||
pext %rcx, %r11, %r10 | |||
mov $0x20202020202020, %r8 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
pop %rbp | |||
pop %rbx | |||
pop %r12 | |||
pop %r13 | |||
pop %r14 | |||
pop %r15 | |||
ret |
@@ -0,0 +1,296 @@ | |||
.data | |||
.p2align 5 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048509_AVX2_square_6_509 | |||
.global _PQCLEAN_NTRUHPS2048509_AVX2_square_6_509 | |||
PQCLEAN_NTRUHPS2048509_AVX2_square_6_509: | |||
_PQCLEAN_NTRUHPS2048509_AVX2_square_6_509: | |||
push %r15 | |||
push %r14 | |||
push %r13 | |||
push %r12 | |||
push %rbx | |||
push %rbp | |||
mov 0(%rsi), %r11 | |||
mov $0x101010101010101, %rbp | |||
pext %rbp, %r11, %r10 | |||
mov $0x249249, %rbx | |||
pdep %rbx, %r10, %r10 | |||
mov %r10, 0(%rdi) | |||
mov $0x202020202020202, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
mov %r10, 8(%rdi) | |||
mov $0x404040404040404, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
mov %r10, 16(%rdi) | |||
mov $0x808080808080808, %r14 | |||
pext %r14, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
mov %r10, 24(%rdi) | |||
mov $0x1010101010101010, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
mov %r10, 32(%rdi) | |||
mov $0x2020202020202020, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
mov %r10, 40(%rdi) | |||
mov $0x4040404040404040, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
mov %r10, 48(%rdi) | |||
mov $0x8080808080808080, %rdx | |||
pext %rdx, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
mov %r10, 56(%rdi) | |||
mov 8(%rsi), %r11 | |||
pext %rbp, %r11, %r10 | |||
mov $0x249249000000, %rcx | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
pext %r13, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
pext %r15, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %r8, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
pext %rdx, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 16(%rsi), %r11 | |||
mov $0x8080810101010101, %rax | |||
pext %rax, %r11, %r10 | |||
mov $0x9249248000000000, %rbx | |||
pdep %rbx, %r10, %r10 | |||
rol $9, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x101020202020202, %rbp | |||
pext %rbp, %r11, %r10 | |||
mov $0x9249240000000000, %r12 | |||
pdep %r12, %r10, %r10 | |||
rol $6, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0x202040404040404, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
rol $6, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x404080808080808, %r14 | |||
pext %r14, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
rol $6, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0x808101010101010, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
rol $6, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0x1010202020202020, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
rol $6, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0x2020404040404040, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
rol $6, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x4040008080808080, %rdx | |||
pext %rdx, %r11, %r10 | |||
mov $0x9049240000000000, %rcx | |||
pdep %rcx, %r10, %r10 | |||
rol $6, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 24(%rsi), %r11 | |||
mov $0x8080808080808080, %rax | |||
pext %rax, %r11, %r10 | |||
mov $0x124924800, %rbx | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x101010101010101, %rbp | |||
pext %rbp, %r11, %r10 | |||
mov $0x24924900, %r13 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0x202020202020202, %r14 | |||
pext %r14, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x404040404040404, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0x808080808080808, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0x1010101010101010, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0x2020202020202020, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x4040404040404040, %rdx | |||
pext %rdx, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 32(%rsi), %r11 | |||
pext %rax, %r11, %r10 | |||
mov $0x124924800000000, %rcx | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
pext %rbp, %r11, %r10 | |||
mov $0x24924900000000, %rbx | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
pext %r15, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %r8, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
pext %rdx, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 40(%rsi), %r11 | |||
mov $0x4040404040408080, %r13 | |||
pext %r13, %r11, %r10 | |||
mov $0x9249240000000000, %rax | |||
pdep %rax, %r10, %r10 | |||
rol $17, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x8080808080810101, %rcx | |||
pext %rcx, %r11, %r10 | |||
mov $0x9249248000000000, %rbp | |||
pdep %rbp, %r10, %r10 | |||
rol $17, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0x101010101020202, %r14 | |||
pext %r14, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
rol $14, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x202020202040404, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
rol $14, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0x404040404080808, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
rol $14, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0x808080808101010, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
rol $14, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0x1010101010202020, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
rol $14, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x2020202020004040, %rdx | |||
pext %rdx, %r11, %r10 | |||
mov $0x9248240000000000, %rbx | |||
pdep %rbx, %r10, %r10 | |||
rol $14, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 48(%rsi), %r11 | |||
mov $0x4040404040404040, %r13 | |||
pext %r13, %r11, %r10 | |||
mov $0x12492480000, %rcx | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x8080808080808080, %rbp | |||
pext %rbp, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0x101010101010101, %r14 | |||
pext %r14, %r11, %r10 | |||
mov $0x2492490000, %r15 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x202020202020202, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0x404040404040404, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0x808080808080808, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0x1010101010101010, %rax | |||
pext %rax, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x2020202020202020, %rdx | |||
pext %rdx, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 56(%rsi), %r11 | |||
mov $0x40404040404040, %rbx | |||
pext %rbx, %r11, %r10 | |||
mov $0x2492480000000000, %r13 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x80808080808080, %rbp | |||
pext %rbp, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
pext %r14, %r11, %r10 | |||
mov $0x2492490000000000, %rcx | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
pext %r8, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %rax, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x20202020202020, %rdx | |||
pext %rdx, %r11, %r10 | |||
mov $0x492490000000000, %r15 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
pop %rbp | |||
pop %rbx | |||
pop %r12 | |||
pop %r13 | |||
pop %r14 | |||
pop %r15 | |||
ret |
@@ -0,0 +1,784 @@ | |||
.data | |||
.p2align 5 | |||
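# vpshufb lookup table that widens 8 packed bytes into 16-bit lanes;
# index 255 has its top bit set, which makes vpshufb write a zero byte.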
cast8_to_16: | |||
.byte 255 | |||
.byte 0 | |||
.byte 255 | |||
.byte 1 | |||
.byte 255 | |||
.byte 2 | |||
.byte 255 | |||
.byte 3 | |||
.byte 255 | |||
.byte 4 | |||
.byte 255 | |||
.byte 5 | |||
.byte 255 | |||
.byte 6 | |||
.byte 255 | |||
.byte 7 | |||
.byte 255 | |||
.byte 0 | |||
.byte 255 | |||
.byte 1 | |||
.byte 255 | |||
.byte 2 | |||
.byte 255 | |||
.byte 3 | |||
.byte 255 | |||
.byte 4 | |||
.byte 255 | |||
.byte 5 | |||
.byte 255 | |||
.byte 6 | |||
.byte 255 | |||
.byte 7 | |||
mask_ff: | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
mask_f: | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
mask_3: | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid | |||
.global _PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid | |||
PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid: | |||
_PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid: | |||
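# Converts each uniform random byte b into a ternary coefficient: after
# widening to 16-bit lanes, the folds (b>>4)+(b&15) and twice (b>>2)+(b&3)
# preserve the value mod 3, and the final vpsubw/vpsraw select performs a
# conditional subtract-3 so every output lane lands in {0, 1, 2}.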
vmovdqa 0(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 0(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 32(%rdi) | |||
vmovdqa 32(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 64(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 96(%rdi) | |||
vmovdqa 64(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 128(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 160(%rdi) | |||
vmovdqa 96(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 192(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 224(%rdi) | |||
vmovdqa 128(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 256(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 288(%rdi) | |||
vmovdqa 160(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 320(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 352(%rdi) | |||
vmovdqa 192(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 384(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 416(%rdi) | |||
vmovdqa 224(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 448(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 480(%rdi) | |||
vmovdqa 256(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 512(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 544(%rdi) | |||
vmovdqa 288(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 576(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 608(%rdi) | |||
vmovdqa 320(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 640(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 672(%rdi) | |||
vmovdqa 352(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 704(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 736(%rdi) | |||
vmovdqa 384(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 768(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 800(%rdi) | |||
vmovdqa 416(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 832(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 864(%rdi) | |||
vmovdqa 448(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 896(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 928(%rdi) | |||
vmovdqa 480(%rsi), %ymm3 | |||
vextracti128 $0, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 960(%rdi) | |||
vextracti128 $1, %ymm3, %xmm1 | |||
vpermq $216, %ymm1, %ymm1 | |||
vpshufb cast8_to_16(%rip), %ymm1, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 992(%rdi) | |||
movw $0, 1016(%rdi) | |||
movw $0, 1018(%rdi) | |||
movw $0, 1020(%rdi) | |||
movw $0, 1022(%rdi) | |||
ret |
@@ -1,8 +1,8 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libntruhps2048509_clean.a | |||
HEADERS=api.h crypto_sort.h owcpa.h params.h poly.h sample.h verify.h | |||
OBJECTS=crypto_sort.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_mod.o poly_r2_inv.o poly_rq_mul.o poly_s3_inv.o sample.o sample_iid.o verify.o | |||
HEADERS=api.h cmov.h crypto_sort_int32.h owcpa.h params.h poly.h sample.h | |||
OBJECTS=cmov.o crypto_sort_int32.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_mod.o poly_r2_inv.o poly_rq_mul.o poly_s3_inv.o sample.o sample_iid.o | |||
CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) | |||
@@ -2,7 +2,7 @@ | |||
# nmake /f Makefile.Microsoft_nmake | |||
LIBRARY=libntruhps2048509_clean.lib | |||
OBJECTS=crypto_sort.obj kem.obj owcpa.obj pack3.obj packq.obj poly.obj poly_lift.obj poly_mod.obj poly_r2_inv.obj poly_rq_mul.obj poly_s3_inv.obj sample.obj sample_iid.obj verify.obj | |||
OBJECTS=cmov.obj crypto_sort_int32.obj kem.obj owcpa.obj pack3.obj packq.obj poly.obj poly_lift.obj poly_mod.obj poly_r2_inv.obj poly_rq_mul.obj poly_s3_inv.obj sample.obj sample_iid.obj | |||
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX | |||
@@ -8,7 +8,7 @@ | |||
#define PQCLEAN_NTRUHPS2048509_CLEAN_CRYPTO_CIPHERTEXTBYTES 699 | |||
#define PQCLEAN_NTRUHPS2048509_CLEAN_CRYPTO_BYTES 32 | |||
#define PQCLEAN_NTRUHPS2048509_CLEAN_CRYPTO_ALGNAME "NTRU-HPS2048509" | |||
#define PQCLEAN_NTRUHPS2048509_CLEAN_CRYPTO_ALGNAME "ntruhps2048509" | |||
int PQCLEAN_NTRUHPS2048509_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); | |||
@@ -0,0 +1,11 @@ | |||
#include "cmov.h" | |||
/* b = 1 means mov, b = 0 means don't mov */
void PQCLEAN_NTRUHPS2048509_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) { | |||
size_t i; | |||
b = (~b + 1); | |||
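/* b is 0 or 1 on entry; ~b + 1 turns it into the byte mask 0x00 or 0xff */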
for (i = 0; i < len; i++) { | |||
r[i] ^= b & (x[i] ^ r[i]); | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef VERIFY_H | |||
#define VERIFY_H | |||
#include "params.h" | |||
#include <stddef.h> | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b); | |||
#endif |
@@ -1,50 +0,0 @@ | |||
// XXX: Temporary placeholder for a faster sort. | |||
// Copied from supercop-20190110/crypto_sort/int32/portable3 | |||
#include <stdint.h> | |||
#include "crypto_sort.h" | |||
#define int32_MINMAX(a,b) \ | |||
do { \ | |||
int32_t ab = (b) ^ (a); \ | |||
int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \ | |||
c ^= ab & (c ^ (b)); \ | |||
c >>= 31; \ | |||
c &= ab; \ | |||
(a) ^= c; \ | |||
(b) ^= c; \ | |||
} while(0) | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort(void *array, long long n) { | |||
long long top, p, q, r, i; | |||
int32_t *x = array; | |||
if (n < 2) { | |||
return; | |||
} | |||
top = 1; | |||
while (top < n - top) { | |||
top += top; | |||
} | |||
for (p = top; p > 0; p >>= 1) { | |||
for (i = 0; i < n - p; ++i) { | |||
if (!(i & p)) { | |||
int32_MINMAX(x[i], x[i + p]); | |||
} | |||
} | |||
i = 0; | |||
for (q = top; q > p; q >>= 1) { | |||
for (; i < n - q; ++i) { | |||
if (!(i & p)) { | |||
int32_t a = x[i + p]; | |||
for (r = q; r > p; r >>= 1) { | |||
int32_MINMAX(a, x[i + r]); | |||
} | |||
x[i + p] = a; | |||
} | |||
} | |||
} | |||
} | |||
} |
@@ -1,6 +0,0 @@ | |||
#ifndef CRYPTO_SORT | |||
#define CRYPTO_SORT | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort(void *array, long long n); | |||
#endif |
@@ -0,0 +1,86 @@ | |||
// Based on supercop-20190110/crypto_sort/int32/x86 | |||
#include "crypto_sort_int32.h" | |||
#include <stdint.h> | |||
#define int32 int32_t | |||
#define int32_MINMAX(a,b) \ | |||
do { \ | |||
int32_t ab = (b) ^ (a); \ | |||
int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \ | |||
c ^= ab & (c ^ (b)); \ | |||
c >>= 31; \ | |||
c &= ab; \ | |||
(a) ^= c; \ | |||
(b) ^= c; \ | |||
} while(0) | |||
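/* Branchless compare-and-swap: leaves min(a,b) in a and max(a,b) in b.
 * The "c ^= ab & (c ^ b)" step repairs the sign of b - a when the
 * subtraction overflows 32 bits; c >>= 31 then yields all-ones exactly
 * when a > b, and masking c to the xor-difference ab makes the final
 * two xors swap the operands in that case only. */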
/* assume 2 <= n <= 0x40000000 */ | |||
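/* The compare/swap schedule of this sorting network depends only on n,
 * never on the data, so the access pattern leaks nothing about the
 * values being sorted (it appears to follow Batcher's merge exchange). */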
void PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort_int32(int32 *array, size_t n) { | |||
size_t top, p, q, r, i, j; | |||
int32 *x = array; | |||
top = 1; | |||
while (top < n - top) { | |||
top += top; | |||
} | |||
for (p = top; p >= 1; p >>= 1) { | |||
i = 0; | |||
while (i + 2 * p <= n) { | |||
for (j = i; j < i + p; ++j) { | |||
int32_MINMAX(x[j], x[j + p]); | |||
} | |||
i += 2 * p; | |||
} | |||
for (j = i; j < n - p; ++j) { | |||
int32_MINMAX(x[j], x[j + p]); | |||
} | |||
i = 0; | |||
j = 0; | |||
for (q = top; q > p; q >>= 1) { | |||
if (j != i) { | |||
for (;;) { | |||
if (j == n - q) { | |||
goto done; | |||
} | |||
int32 a = x[j + p]; | |||
for (r = q; r > p; r >>= 1) { | |||
int32_MINMAX(a, x[j + r]); | |||
} | |||
x[j + p] = a; | |||
++j; | |||
if (j == i + p) { | |||
i += 2 * p; | |||
break; | |||
} | |||
} | |||
} | |||
while (i + p <= n - q) { | |||
for (j = i; j < i + p; ++j) { | |||
int32 a = x[j + p]; | |||
for (r = q; r > p; r >>= 1) { | |||
int32_MINMAX(a, x[j + r]); | |||
} | |||
x[j + p] = a; | |||
} | |||
i += 2 * p; | |||
} | |||
/* now i + p > n - q */ | |||
j = i; | |||
while (j < n - q) { | |||
int32 a = x[j + p]; | |||
for (r = q; r > p; r >>= 1) { | |||
int32_MINMAX(a, x[j + r]); | |||
} | |||
x[j + p] = a; | |||
++j; | |||
} | |||
done: | |||
; | |||
} | |||
} | |||
} |
@@ -0,0 +1,11 @@ | |||
#ifndef CRYPTO_SORT | |||
#define CRYPTO_SORT | |||
#include "params.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort_int32(int32_t *array, size_t n); | |||
#endif |
@@ -1,12 +1,10 @@ | |||
#include <stdint.h> | |||
#include "api.h" | |||
#include "cmov.h" | |||
#include "fips202.h" | |||
#include "owcpa.h" | |||
#include "params.h" | |||
#include "randombytes.h" | |||
#include "sample.h" | |||
#include "verify.h" | |||
// API FUNCTIONS | |||
int PQCLEAN_NTRUHPS2048509_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
@@ -51,7 +49,7 @@ int PQCLEAN_NTRUHPS2048509_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, co | |||
fail |= PQCLEAN_NTRUHPS2048509_CLEAN_owcpa_dec(rm, c, sk); | |||
/* If fail = 0 then c = Enc(h, rm). There is no need to re-encapsulate. */ | |||
/* See comment in PQCLEAN_NTRUHPS2048509_CLEAN_owcpa_dec for details. */
sha3_256(k, rm, NTRU_OWCPA_MSGBYTES); | |||
@@ -59,6 +59,7 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_owcpa_keypair(unsigned char *pk, | |||
PQCLEAN_NTRUHPS2048509_CLEAN_poly_Z3_to_Zq(f); | |||
PQCLEAN_NTRUHPS2048509_CLEAN_poly_Z3_to_Zq(g); | |||
/* g = 3*g */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
g->coeffs[i] = 3 * g->coeffs[i]; | |||
@@ -19,7 +19,6 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_poly_S3_tobytes(unsigned char msg[NTRU_OWCPA_M | |||
c = (3 * c + a->coeffs[5 * i + j]) & 255; | |||
} | |||
msg[i] = c; | |||
} | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_poly_S3_frombytes(poly *r, const unsigned char msg[NTRU_OWCPA_MSGBYTES]) { | |||
@@ -29,7 +29,6 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_poly_Sq_tobytes(unsigned char *r, const poly * | |||
t[j] = 0; | |||
} | |||
switch (NTRU_PACK_DEG & 0x07) { | |||
// cases 0 and 6 are impossible since 2 generates (Z/n)* and | |||
// p mod 8 in {1, 7} implies that 2 is a quadratic residue. | |||
@@ -61,19 +60,18 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_poly_Sq_frombytes(poly *r, const unsigned char | |||
r->coeffs[8 * i + 6] = (a[11 * i + 8] >> 2) | (((uint16_t)a[11 * i + 9] & 0x1f) << 6); | |||
r->coeffs[8 * i + 7] = (a[11 * i + 9] >> 5) | (((uint16_t)a[11 * i + 10] & 0xff) << 3); | |||
} | |||
switch (NTRU_PACK_DEG & 0x07) { | |||
// cases 0 and 6 are impossible since 2 generates (Z/n)* and | |||
// p mod 8 in {1, 7} implies that 2 is a quadratic residue. | |||
case 4: | |||
r->coeffs[8 * i + 0] = (unsigned char) (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8); | |||
r->coeffs[8 * i + 1] = (unsigned char) (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5); | |||
r->coeffs[8 * i + 2] = (unsigned char) (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10); | |||
r->coeffs[8 * i + 3] = (unsigned char) (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7); | |||
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8); | |||
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5); | |||
r->coeffs[8 * i + 2] = (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10); | |||
r->coeffs[8 * i + 3] = (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7); | |||
break; | |||
case 2: | |||
r->coeffs[8 * i + 0] = (unsigned char) (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8); | |||
r->coeffs[8 * i + 1] = (unsigned char) (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5); | |||
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8); | |||
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5); | |||
break; | |||
} | |||
r->coeffs[NTRU_N - 1] = 0; | |||
@@ -5,6 +5,7 @@ | |||
#define NTRU_N 509 | |||
#define NTRU_LOGQ 11 | |||
/* Do not modify below this line */ | |||
#define PAD32(X) ((((X) + 31)/32)*32) | |||
@@ -1,16 +1,13 @@ | |||
#ifndef POLY_H | |||
#define POLY_H | |||
#include <stdint.h> | |||
#include "params.h" | |||
#include <stdint.h> | |||
#define MODQ(X) ((X) & (NTRU_Q-1)) | |||
typedef struct { | |||
// round to nearest multiple of 32 to make it easier to load into vector | |||
// registers without having to do bound checks | |||
#define NTRU_N_32 PAD32(NTRU_N) | |||
uint16_t coeffs[NTRU_N]; | |||
} poly; | |||
@@ -38,5 +35,4 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_poly_S3_inv(poly *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_poly_Z3_to_Zq(poly *r); | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_poly_trinary_Zq_to_Z3(poly *r); | |||
#endif |
@@ -8,3 +8,4 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_poly_lift(poly *r, const poly *a) { | |||
PQCLEAN_NTRUHPS2048509_CLEAN_poly_Z3_to_Zq(r); | |||
} | |||
@@ -1,113 +1,69 @@ | |||
#include "poly.h" | |||
#include "verify.h" | |||
#define POLY_R2_ADD(I,A,B,S) \ | |||
for ((I)=0; (I)<NTRU_N; (I)++) { \ | |||
(A).coeffs[(I)] ^= (B).coeffs[(I)] * (S); \ | |||
} | |||
static void cswappoly(poly *a, poly *b, int swap) { | |||
int i; | |||
uint16_t t; | |||
swap = -swap; | |||
for (i = 0; i < NTRU_N; i++) { | |||
t = (a->coeffs[i] ^ b->coeffs[i]) & swap; | |||
a->coeffs[i] ^= t; | |||
b->coeffs[i] ^= t; | |||
} | |||
} | |||
static inline void poly_divx(poly *a, int s) { | |||
int i; | |||
/* Based on supercop-20200702/crypto_core/invhrss701/simpler/core.c */ | |||
for (i = 1; i < NTRU_N; i++) { | |||
a->coeffs[i - 1] = (unsigned char) ((s * a->coeffs[i]) | (!s * a->coeffs[i - 1])); | |||
} | |||
a->coeffs[NTRU_N - 1] = (!s * a->coeffs[NTRU_N - 1]); | |||
} | |||
static inline void poly_mulx(poly *a, int s) { | |||
int i; | |||
#include "poly.h" | |||
for (i = 1; i < NTRU_N; i++) { | |||
a->coeffs[NTRU_N - i] = (unsigned char) ((s * a->coeffs[NTRU_N - i - 1]) | (!s * a->coeffs[NTRU_N - i])); | |||
} | |||
a->coeffs[0] = (!s * a->coeffs[0]); | |||
/* return -1 if x<0 and y<0; otherwise return 0 */ | |||
static inline int both_negative_mask(int x, int y) { | |||
return (x & y) >> 15; | |||
} | |||
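/* The arguments used here stay well within 16 bits (|delta| is bounded
 * by 2*(NTRU_N-1) and g.coeffs[0] is 0 or 1), so bit 15 carries the
 * sign for both and the shift produces 0 or -1. */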
void PQCLEAN_NTRUHPS2048509_CLEAN_poly_R2_inv(poly *r, const poly *a) { | |||
/* Schroeppel--Orman--O'Malley--Spatscheck | |||
* "Almost Inverse" algorithm as described | |||
* by Silverman in NTRU Tech Report #14 */ | |||
// with several modifications to make it run in constant-time | |||
int i, j; | |||
int k = 0; | |||
uint16_t degf = NTRU_N - 1; | |||
uint16_t degg = NTRU_N - 1; | |||
int sign, t, swap; | |||
int16_t done = 0; | |||
poly b, f, g; | |||
poly *c = r; // save some stack space | |||
poly *temp_r = &f; | |||
poly f, g, v, w; | |||
int i, loop, delta; | |||
int sign, swap, t; | |||
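/* Constant-time inversion in Z2[x]/(x^N - 1) computed with 2*(N-1)-1
 * division steps in the style of the Bernstein--Yang "safegcd"
 * algorithm; v and w accumulate the cofactor that ends up holding the
 * inverse. */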
/* b(X) := 1 */ | |||
for (i = 1; i < NTRU_N; i++) { | |||
b.coeffs[i] = 0; | |||
for (i = 0; i < NTRU_N; ++i) { | |||
v.coeffs[i] = 0; | |||
} | |||
b.coeffs[0] = 1; | |||
/* c(X) := 0 */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
c->coeffs[i] = 0; | |||
for (i = 0; i < NTRU_N; ++i) { | |||
w.coeffs[i] = 0; | |||
} | |||
w.coeffs[0] = 1; | |||
/* f(X) := a(X) */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
f.coeffs[i] = a->coeffs[i] & 1; | |||
for (i = 0; i < NTRU_N; ++i) { | |||
f.coeffs[i] = 1; | |||
} | |||
/* g(X) := 1 + X + X^2 + ... + X^{N-1} */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
g.coeffs[i] = 1; | |||
for (i = 0; i < NTRU_N - 1; ++i) { | |||
g.coeffs[NTRU_N - 2 - i] = (a->coeffs[i] ^ a->coeffs[NTRU_N - 1]) & 1; | |||
} | |||
g.coeffs[NTRU_N - 1] = 0; | |||
for (j = 0; j < 2 * (NTRU_N - 1) - 1; j++) { | |||
sign = f.coeffs[0]; | |||
swap = sign & !done & ((degf - degg) >> 15); | |||
cswappoly(&f, &g, swap); | |||
cswappoly(&b, c, swap); | |||
t = (degf ^ degg) & (-swap); | |||
degf ^= t; | |||
degg ^= t; | |||
POLY_R2_ADD(i, f, g, sign * (!done)); | |||
POLY_R2_ADD(i, b, (*c), sign * (!done)); | |||
poly_divx(&f, !done); | |||
poly_mulx(c, !done); | |||
degf -= !done; | |||
k += !done; | |||
done = 1 - (((uint16_t) - degf) >> 15); | |||
} | |||
delta = 1; | |||
k = k - NTRU_N * ((uint16_t)(NTRU_N - k - 1) >> 15); | |||
for (loop = 0; loop < 2 * (NTRU_N - 1) - 1; ++loop) { | |||
for (i = NTRU_N - 1; i > 0; --i) { | |||
v.coeffs[i] = v.coeffs[i - 1]; | |||
} | |||
v.coeffs[0] = 0; | |||
sign = g.coeffs[0] & f.coeffs[0]; | |||
swap = both_negative_mask(-delta, -(int) g.coeffs[0]); | |||
delta ^= swap & (delta ^ -delta); | |||
delta += 1; | |||
for (i = 0; i < NTRU_N; ++i) { | |||
t = swap & (f.coeffs[i] ^ g.coeffs[i]); | |||
f.coeffs[i] ^= t; | |||
g.coeffs[i] ^= t; | |||
t = swap & (v.coeffs[i] ^ w.coeffs[i]); | |||
v.coeffs[i] ^= t; | |||
w.coeffs[i] ^= t; | |||
} | |||
/* Return X^{N-k} * b(X) */ | |||
/* This is a k-coefficient rotation. We do this by looking at the binary | |||
representation of k, rotating for every power of 2, and performing a cmov | |||
if the respective bit is set. */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
r->coeffs[i] = b.coeffs[i]; | |||
for (i = 0; i < NTRU_N; ++i) { | |||
g.coeffs[i] = g.coeffs[i] ^ (sign & f.coeffs[i]); | |||
} | |||
for (i = 0; i < NTRU_N; ++i) { | |||
w.coeffs[i] = w.coeffs[i] ^ (sign & v.coeffs[i]); | |||
} | |||
for (i = 0; i < NTRU_N - 1; ++i) { | |||
g.coeffs[i] = g.coeffs[i + 1]; | |||
} | |||
g.coeffs[NTRU_N - 1] = 0; | |||
} | |||
for (i = 0; i < 10; i++) { | |||
for (j = 0; j < NTRU_N; j++) { | |||
temp_r->coeffs[j] = r->coeffs[(j + (1 << i)) % NTRU_N]; | |||
} | |||
PQCLEAN_NTRUHPS2048509_CLEAN_cmov((unsigned char *) & (r->coeffs), | |||
(unsigned char *) & (temp_r->coeffs), sizeof(uint16_t) * NTRU_N, k & 1); | |||
k >>= 1; | |||
for (i = 0; i < NTRU_N - 1; ++i) { | |||
r->coeffs[i] = v.coeffs[NTRU_N - 2 - i]; | |||
} | |||
r->coeffs[NTRU_N - 1] = 0; | |||
} |
@@ -1,137 +1,78 @@ | |||
/* Based on supercop-20200702/crypto_core/invhrss701/simpler/core.c */ | |||
#include "poly.h" | |||
#include "verify.h" | |||
static uint16_t mod3(uint16_t a) { | |||
uint16_t r; | |||
static inline uint8_t mod3(uint8_t a) { /* a between 0 and 9 */ | |||
int16_t t, c; | |||
r = (a >> 8) + (a & 0xff); // r mod 255 == a mod 255 | |||
r = (r >> 4) + (r & 0xf); // r' mod 15 == r mod 15 | |||
r = (r >> 2) + (r & 0x3); // r' mod 3 == r mod 3 | |||
r = (r >> 2) + (r & 0x3); // r' mod 3 == r mod 3 | |||
t = r - 3; | |||
c = t >> 15; | |||
return (c & r) ^ (~c & t); | |||
a = (a >> 2) + (a & 3); /* between 0 and 4 */ | |||
t = a - 3; | |||
c = t >> 5; | |||
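/* c is all-ones exactly when t < 0 (i.e. a < 3), so the return below
 * selects a unchanged in that case and t = a - 3 otherwise */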
return t ^ (c & (a ^ t)); | |||
} | |||
#define POLY_S3_FMADD(I,A,B,S) \ | |||
for ((I)=0; (I)<NTRU_N; (I)++) { \ | |||
(A).coeffs[(I)] = mod3((A).coeffs[(I)] + (S) * (B).coeffs[(I)]); \ | |||
} | |||
static void cswappoly(poly *a, poly *b, int swap) { | |||
int i; | |||
uint16_t t; | |||
swap = -swap; | |||
for (i = 0; i < NTRU_N; i++) { | |||
t = (a->coeffs[i] ^ b->coeffs[i]) & swap; | |||
a->coeffs[i] ^= t; | |||
b->coeffs[i] ^= t; | |||
} | |||
/* return -1 if x<0 and y<0; otherwise return 0 */ | |||
static inline int both_negative_mask(int x, int y) { | |||
return (x & y) >> 15; | |||
} | |||
static inline void poly_divx(poly *a, int s) { | |||
int i; | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_poly_S3_inv(poly *r, const poly *a) { | |||
poly f, g, v, w; | |||
int i, loop, delta; | |||
int sign, swap, t; | |||
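/* Same constant-time divstep structure as poly_R2_inv above, with the
 * coefficient arithmetic carried out mod 3 via mod3(). */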
for (i = 1; i < NTRU_N; i++) { | |||
a->coeffs[i - 1] = (unsigned char) ((s * a->coeffs[i]) | (!s * a->coeffs[i - 1])); | |||
for (i = 0; i < NTRU_N; ++i) { | |||
v.coeffs[i] = 0; | |||
} | |||
a->coeffs[NTRU_N - 1] = (!s * a->coeffs[NTRU_N - 1]); | |||
} | |||
static inline void poly_mulx(poly *a, int s) { | |||
int i; | |||
for (i = 1; i < NTRU_N; i++) { | |||
a->coeffs[NTRU_N - i] = (unsigned char) ((s * a->coeffs[NTRU_N - i - 1]) | (!s * a->coeffs[NTRU_N - i])); | |||
for (i = 0; i < NTRU_N; ++i) { | |||
w.coeffs[i] = 0; | |||
} | |||
a->coeffs[0] = (!s * a->coeffs[0]); | |||
} | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_poly_S3_inv(poly *r, const poly *a) { | |||
/* Schroeppel--Orman--O'Malley--Spatscheck | |||
* "Almost Inverse" algorithm as described | |||
* by Silverman in NTRU Tech Report #14 */ | |||
// with several modifications to make it run in constant-time | |||
int i, j; | |||
uint16_t k = 0; | |||
uint16_t degf = NTRU_N - 1; | |||
uint16_t degg = NTRU_N - 1; | |||
int sign, fsign = 0, t, swap; | |||
int16_t done = 0; | |||
poly b, c, f, g; | |||
poly *temp_r = &f; | |||
w.coeffs[0] = 1; | |||
/* b(X) := 1 */ | |||
for (i = 1; i < NTRU_N; i++) { | |||
b.coeffs[i] = 0; | |||
for (i = 0; i < NTRU_N; ++i) { | |||
f.coeffs[i] = 1; | |||
} | |||
b.coeffs[0] = 1; | |||
/* c(X) := 0 */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
c.coeffs[i] = 0; | |||
for (i = 0; i < NTRU_N - 1; ++i) { | |||
g.coeffs[NTRU_N - 2 - i] = mod3((a->coeffs[i] & 3) + 2 * (a->coeffs[NTRU_N - 1] & 3)); | |||
} | |||
g.coeffs[NTRU_N - 1] = 0; | |||
/* f(X) := a(X) */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
f.coeffs[i] = a->coeffs[i]; | |||
} | |||
delta = 1; | |||
/* g(X) := 1 + X + X^2 + ... + X^{N-1} */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
g.coeffs[i] = 1; | |||
} | |||
for (loop = 0; loop < 2 * (NTRU_N - 1) - 1; ++loop) { | |||
for (i = NTRU_N - 1; i > 0; --i) { | |||
v.coeffs[i] = v.coeffs[i - 1]; | |||
} | |||
v.coeffs[0] = 0; | |||
for (j = 0; j < 2 * (NTRU_N - 1) - 1; j++) { | |||
sign = mod3(2 * g.coeffs[0] * f.coeffs[0]); | |||
swap = (((sign & 2) >> 1) | sign) & !done & ((degf - degg) >> 15); | |||
cswappoly(&f, &g, swap); | |||
cswappoly(&b, &c, swap); | |||
t = (degf ^ degg) & (-swap); | |||
degf ^= t; | |||
degg ^= t; | |||
swap = both_negative_mask(-delta, -(int) g.coeffs[0]); | |||
delta ^= swap & (delta ^ -delta); | |||
delta += 1; | |||
for (i = 0; i < NTRU_N; ++i) { | |||
t = swap & (f.coeffs[i] ^ g.coeffs[i]); | |||
f.coeffs[i] ^= t; | |||
g.coeffs[i] ^= t; | |||
t = swap & (v.coeffs[i] ^ w.coeffs[i]); | |||
v.coeffs[i] ^= t; | |||
w.coeffs[i] ^= t; | |||
} | |||
for (i = 0; i < NTRU_N; i++) { | |||
f.coeffs[i] = mod3(f.coeffs[i] + ((uint16_t) (sign * (!done))) * g.coeffs[i]); | |||
for (i = 0; i < NTRU_N; ++i) { | |||
g.coeffs[i] = mod3(g.coeffs[i] + sign * f.coeffs[i]); | |||
} | |||
for (i = 0; i < NTRU_N; i++) { | |||
b.coeffs[i] = mod3(b.coeffs[i] + ((uint16_t) (sign * (!done))) * c.coeffs[i]); | |||
for (i = 0; i < NTRU_N; ++i) { | |||
w.coeffs[i] = mod3(w.coeffs[i] + sign * v.coeffs[i]); | |||
} | |||
poly_divx(&f, !done); | |||
poly_mulx(&c, !done); | |||
degf -= !done; | |||
k += !done; | |||
done = 1 - (((uint16_t) - degf) >> 15); | |||
} | |||
fsign = f.coeffs[0]; | |||
k = k - NTRU_N * ((uint16_t)(NTRU_N - k - 1) >> 15); | |||
/* Return X^{N-k} * b(X) */ | |||
/* This is a k-coefficient rotation. We do this by looking at the binary | |||
representation of k, rotating for every power of 2, and performing a cmov | |||
if the respective bit is set. */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
r->coeffs[i] = mod3((uint16_t) fsign * b.coeffs[i]); | |||
} | |||
for (i = 0; i < 10; i++) { | |||
for (j = 0; j < NTRU_N; j++) { | |||
temp_r->coeffs[j] = r->coeffs[(j + (1 << i)) % NTRU_N]; | |||
for (i = 0; i < NTRU_N - 1; ++i) { | |||
g.coeffs[i] = g.coeffs[i + 1]; | |||
} | |||
PQCLEAN_NTRUHPS2048509_CLEAN_cmov((unsigned char *) & (r->coeffs), | |||
(unsigned char *) & (temp_r->coeffs), sizeof(uint16_t) * NTRU_N, k & 1); | |||
k >>= 1; | |||
g.coeffs[NTRU_N - 1] = 0; | |||
} | |||
/* Reduce modulo Phi_n */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
r->coeffs[i] = mod3(r->coeffs[i] + 2 * r->coeffs[NTRU_N - 1]); | |||
sign = f.coeffs[0]; | |||
for (i = 0; i < NTRU_N - 1; ++i) { | |||
r->coeffs[i] = mod3(sign * v.coeffs[NTRU_N - 2 - i]); | |||
} | |||
r->coeffs[NTRU_N - 1] = 0; | |||
} |
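For reference, a minimal standalone sketch (hypothetical test harness, not part of this patch) checking the two constant-time helpers above: mod3 folds a value in [0, 9] to its residue mod 3, and both_negative_mask yields an all-ones mask only when both arguments are negative (assuming |x|, |y| < 2^15, as in the callers):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* local copies of the helpers above, so this compiles on its own */
static inline uint8_t mod3(uint8_t a) {
    int16_t t, c;
    a = (a >> 2) + (a & 3);
    t = a - 3;
    c = t >> 5; /* all-ones iff t < 0 (arithmetic shift assumed) */
    return (uint8_t) (t ^ (c & (a ^ t)));
}

static inline int both_negative_mask(int x, int y) {
    return (x & y) >> 15;
}

int main(void) {
    int a;
    for (a = 0; a <= 9; a++) {
        assert(mod3((uint8_t) a) == a % 3);
    }
    assert(both_negative_mask(-1, -5) == -1);
    assert(both_negative_mask(-1, 5) == 0);
    assert(both_negative_mask(1, -5) == 0);
    puts("ok");
    return 0;
}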
@@ -1,27 +1,30 @@ | |||
#include "crypto_sort.h" | |||
#include "sample.h" | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_sample_fg(poly *f, poly *g, const unsigned char uniformbytes[NTRU_SAMPLE_FG_BYTES]) { | |||
PQCLEAN_NTRUHPS2048509_CLEAN_sample_iid(f, uniformbytes); | |||
PQCLEAN_NTRUHPS2048509_CLEAN_sample_fixed_type(g, uniformbytes + NTRU_SAMPLE_IID_BYTES); | |||
} | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_sample_rm(poly *r, poly *m, const unsigned char uniformbytes[NTRU_SAMPLE_RM_BYTES]) { | |||
PQCLEAN_NTRUHPS2048509_CLEAN_sample_iid(r, uniformbytes); | |||
PQCLEAN_NTRUHPS2048509_CLEAN_sample_fixed_type(m, uniformbytes + NTRU_SAMPLE_IID_BYTES); | |||
} | |||
#include "crypto_sort_int32.h" | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_sample_fixed_type(poly *r, const unsigned char u[NTRU_SAMPLE_FT_BYTES]) { | |||
// Assumes NTRU_SAMPLE_FT_BYTES = ceil(30*(n-1)/8) | |||
    int32_t s[NTRU_N - 1];
int i; | |||
// Use 30 bits of u per word | |||
for (i = 0; i < (NTRU_N - 1) / 4; i++) { | |||
s[4 * i + 0] = (u[15 * i + 0] << 2) + (u[15 * i + 1] << 10) + (u[15 * i + 2] << 18) + ((uint32_t) u[15 * i + 3] << 26); | |||
s[4 * i + 1] = ((u[15 * i + 3] & 0xc0) >> 4) + (u[15 * i + 4] << 4) + (u[15 * i + 5] << 12) + (u[15 * i + 6] << 20) + ((uint32_t) u[15 * i + 7] << 28); | |||
s[4 * i + 2] = ((u[15 * i + 7] & 0xf0) >> 2) + (u[15 * i + 8] << 6) + (u[15 * i + 9] << 14) + (u[15 * i + 10] << 22) + ((uint32_t) u[15 * i + 11] << 30); | |||
        s[4 * i + 3] = (u[15 * i + 11] & 0xfc) + (u[15 * i + 12] << 8) + (u[15 * i + 13] << 16) + ((uint32_t) u[15 * i + 14] << 24);
} | |||
@@ -33,7 +36,7 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_sample_fixed_type(poly *r, const unsigned char | |||
s[i] |= 2; | |||
} | |||
    PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort_int32(s, NTRU_N - 1);
for (i = 0; i < NTRU_N - 1; i++) { | |||
r->coeffs[i] = ((uint16_t) (s[i] & 3)); | |||
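For context, a toy illustration (hypothetical, not PQClean code, and not constant time) of the sort-based fixed-weight sampler above: each coefficient's 2-bit payload rides in the low bits of a 30-bit random key, sorting by key shuffles the payloads, and the low 2 bits come back out as the trinary coefficients. PQClean uses a constant-time sorting network where this sketch uses qsort:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define N 11 /* toy size; PQClean uses NTRU_N */
#define W 6  /* toy weight; PQClean uses NTRU_WEIGHT */

static int cmp(const void *a, const void *b) {
    int32_t x = *(const int32_t *) a, y = *(const int32_t *) b;
    return (x > y) - (x < y);
}

int main(void) {
    int32_t s[N - 1];
    int i, ones = 0, twos = 0;
    srand(12345);
    for (i = 0; i < N - 1; i++) {
        /* random 30-bit key in bits 2..31, low 2 bits left clear */
        s[i] = (int32_t) (((uint32_t) rand() << 2) & 0x3ffffffcu);
    }
    for (i = 0; i < W / 2; i++) {
        s[i] |= 1; /* first W/2 payloads are +1 */
    }
    for (i = W / 2; i < W; i++) {
        s[i] |= 2; /* next W/2 payloads are -1 (encoded as 2) */
    }
    qsort(s, N - 1, sizeof s[0], cmp); /* shuffle by random key */
    for (i = 0; i < N - 1; i++) {
        int c = s[i] & 3;
        ones += (c == 1);
        twos += (c == 2);
        printf("%d ", c);
    }
    printf("\nones=%d twos=%d\n", ones, twos); /* expect 3 and 3 */
    return 0;
}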
@@ -1,8 +1,6 @@ | |||
#ifndef SAMPLE_H | |||
#define SAMPLE_H | |||
#include "params.h" | |||
#include "poly.h" | |||
@@ -13,4 +11,5 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_sample_iid(poly *r, const unsigned char unifor | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_sample_fixed_type(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_FT_BYTES]); | |||
#endif |
@@ -1,29 +0,0 @@ | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
#include "verify.h" | |||
/* returns 0 for equal strings, 1 for non-equal strings */ | |||
unsigned char PQCLEAN_NTRUHPS2048509_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len) { | |||
uint64_t r; | |||
size_t i; | |||
r = 0; | |||
for (i = 0; i < len; i++) { | |||
r |= a[i] ^ b[i]; | |||
} | |||
r = (~r + 1); // Two's complement | |||
r >>= 63; | |||
return (unsigned char)r; | |||
} | |||
/* b = 1 means mov, b = 0 means don't mov*/ | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) { | |||
size_t i; | |||
b = (~b + 1); // Two's complement | |||
for (i = 0; i < len; i++) { | |||
r[i] ^= b & (x[i] ^ r[i]); | |||
} | |||
} |
@@ -1,12 +0,0 @@ | |||
#ifndef VERIFY_H | |||
#define VERIFY_H | |||
#include <stdio.h> | |||
/* returns 0 for equal strings, 1 for non-equal strings */ | |||
unsigned char PQCLEAN_NTRUHPS2048509_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len); | |||
/* b = 1 means mov, b = 0 means don't mov*/ | |||
void PQCLEAN_NTRUHPS2048509_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b); | |||
#endif |
@@ -1,4 +1,4 @@ | |||
name: NTRU-HPS2048677 | |||
name: ntruhps2048677 | |||
type: kem | |||
claimed-nist-level: 3 | |||
claimed-security: IND-CCA2 | |||
@@ -15,9 +15,22 @@ auxiliary-submitters: | |||
- Jeffrey Hoffstein | |||
- Andreas Hülsing | |||
- Joost Rijneveld | |||
- Tsunekazu Saito | |||
- Peter Schwabe | |||
- William Whyte | |||
- Keita Xagawa | |||
- Takashi Yamakawa | |||
- Zhenfei Zhang | |||
implementations: | |||
- name: clean | |||
version: https://github.com/jschanck/ntru/tree/485dde03 reference implementation | |||
version: https://github.com/jschanck/ntru/tree/4699d70a reference implementation | |||
- name: avx2 | |||
version: https://github.com/jschanck/ntru/tree/4699d70a avx2 implementation | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- avx2 | |||
- bmi2 |
@@ -0,0 +1 @@ | |||
Public Domain |
@@ -0,0 +1,24 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libntruhps2048677_avx2.a | |||
HEADERS=api.h cmov.h crypto_sort_int32.h owcpa.h params.h poly.h poly_r2_inv.h sample.h | |||
OBJECTS=cmov.o crypto_sort_int32.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_r2_inv.o poly_s3_inv.o sample.o sample_iid.o \ | |||
square_1_677_patience.o square_2_677_patience.o square_3_677_patience.o square_5_677_patience.o square_10_677_shufbytes.o square_21_677_shufbytes.o square_42_677_shufbytes.o square_84_677_shufbytes.o square_168_677_shufbytes.o square_336_677_shufbytes.o \ | |||
poly_mod_3_Phi_n.o poly_mod_q_Phi_n.o poly_r2_mul.o poly_rq_mul.o poly_rq_to_s3.o vec32_sample_iid.o | |||
CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) | |||
all: $(LIB) | |||
%.o: %.s $(HEADERS) | |||
$(AS) -o $@ $< | |||
%.o: %.c $(HEADERS) | |||
$(CC) $(CFLAGS) -c -o $@ $< | |||
$(LIB): $(OBJECTS) | |||
$(AR) -r $@ $(OBJECTS) | |||
clean: | |||
$(RM) $(OBJECTS) | |||
$(RM) $(LIB) |
@@ -0,0 +1,19 @@ | |||
#ifndef PQCLEAN_NTRUHPS2048677_AVX2_API_H | |||
#define PQCLEAN_NTRUHPS2048677_AVX2_API_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRUHPS2048677_AVX2_CRYPTO_SECRETKEYBYTES 1234 | |||
#define PQCLEAN_NTRUHPS2048677_AVX2_CRYPTO_PUBLICKEYBYTES 930 | |||
#define PQCLEAN_NTRUHPS2048677_AVX2_CRYPTO_CIPHERTEXTBYTES 930 | |||
#define PQCLEAN_NTRUHPS2048677_AVX2_CRYPTO_BYTES 32 | |||
#define PQCLEAN_NTRUHPS2048677_AVX2_CRYPTO_ALGNAME "ntruhps2048677" | |||
int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); | |||
int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk); | |||
int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk); | |||
#endif |
@@ -0,0 +1,11 @@ | |||
#include "cmov.h" | |||
/* b = 1 means mov, b = 0 means don't mov*/ | |||
void PQCLEAN_NTRUHPS2048677_AVX2_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) { | |||
size_t i; | |||
b = (~b + 1); | |||
for (i = 0; i < len; i++) { | |||
r[i] ^= b & (x[i] ^ r[i]); | |||
} | |||
} |
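A small standalone sketch (hypothetical, not part of this patch) showing how the constant-time cmov above behaves: b = 0 leaves r untouched, b = 1 copies x into r, with no branch on b:

#include <stddef.h>
#include <stdio.h>

/* local copy of the cmov above, for a self-contained demo */
static void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
    size_t i;
    b = (unsigned char) (~b + 1); /* 0x00 if b = 0, 0xff if b = 1 */
    for (i = 0; i < len; i++) {
        r[i] ^= b & (x[i] ^ r[i]);
    }
}

int main(void) {
    unsigned char r[4] = {1, 2, 3, 4};
    unsigned char x[4] = {9, 9, 9, 9};
    cmov(r, x, sizeof r, 0);
    printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]); /* 1 2 3 4 */
    cmov(r, x, sizeof r, 1);
    printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]); /* 9 9 9 9 */
    return 0;
}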
@@ -0,0 +1,10 @@ | |||
#ifndef VERIFY_H | |||
#define VERIFY_H | |||
#include "params.h" | |||
#include <stddef.h> | |||
void PQCLEAN_NTRUHPS2048677_AVX2_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b); | |||
#endif |
@@ -0,0 +1,11 @@ | |||
#ifndef CRYPTO_SORT | |||
#define CRYPTO_SORT | |||
#include "params.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
void PQCLEAN_NTRUHPS2048677_AVX2_crypto_sort_int32(int32_t *x, size_t n); | |||
#endif |
@@ -0,0 +1,68 @@ | |||
#include "api.h" | |||
#include "cmov.h" | |||
#include "fips202.h" | |||
#include "owcpa.h" | |||
#include "params.h" | |||
#include "randombytes.h" | |||
#include "sample.h" | |||
// API FUNCTIONS | |||
int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
uint8_t seed[NTRU_SAMPLE_FG_BYTES]; | |||
randombytes(seed, NTRU_SAMPLE_FG_BYTES); | |||
PQCLEAN_NTRUHPS2048677_AVX2_owcpa_keypair(pk, sk, seed); | |||
randombytes(sk + NTRU_OWCPA_SECRETKEYBYTES, NTRU_PRFKEYBYTES); | |||
return 0; | |||
} | |||
int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) { | |||
poly r, m; | |||
uint8_t rm[NTRU_OWCPA_MSGBYTES]; | |||
uint8_t rm_seed[NTRU_SAMPLE_RM_BYTES]; | |||
randombytes(rm_seed, NTRU_SAMPLE_RM_BYTES); | |||
PQCLEAN_NTRUHPS2048677_AVX2_sample_rm(&r, &m, rm_seed); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(rm, &r); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(rm + NTRU_PACK_TRINARY_BYTES, &m); | |||
sha3_256(k, rm, NTRU_OWCPA_MSGBYTES); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(&r); | |||
PQCLEAN_NTRUHPS2048677_AVX2_owcpa_enc(c, &r, &m, pk); | |||
return 0; | |||
} | |||
int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { | |||
int i, fail; | |||
uint8_t rm[NTRU_OWCPA_MSGBYTES]; | |||
uint8_t buf[NTRU_PRFKEYBYTES + NTRU_CIPHERTEXTBYTES]; | |||
fail = 0; | |||
/* Check that unused bits of last byte of ciphertext are zero */ | |||
fail |= c[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG)))); | |||
fail |= PQCLEAN_NTRUHPS2048677_AVX2_owcpa_dec(rm, c, sk); | |||
/* If fail = 0 then c = Enc(h, rm). There is no need to re-encapsulate. */ | |||
/* See comment in PQCLEAN_NTRUHPS2048677_AVX2_owcpa_dec for details. */ | |||
sha3_256(k, rm, NTRU_OWCPA_MSGBYTES); | |||
/* shake(secret PRF key || input ciphertext) */ | |||
for (i = 0; i < NTRU_PRFKEYBYTES; i++) { | |||
buf[i] = sk[i + NTRU_OWCPA_SECRETKEYBYTES]; | |||
} | |||
for (i = 0; i < NTRU_CIPHERTEXTBYTES; i++) { | |||
buf[NTRU_PRFKEYBYTES + i] = c[i]; | |||
} | |||
sha3_256(rm, buf, NTRU_PRFKEYBYTES + NTRU_CIPHERTEXTBYTES); | |||
PQCLEAN_NTRUHPS2048677_AVX2_cmov(k, rm, NTRU_SHAREDKEYBYTES, (unsigned char) fail); | |||
return 0; | |||
} |
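A worked check (illustration only) of the trailing-bits test in crypto_kem_dec above: with NTRU_LOGQ = 11 and NTRU_PACK_DEG = 676, the ciphertext packs 7436 bits, so the last of the 930 bytes uses only its low 4 bits and the mask rejects anything in the high 4:

#include <stdio.h>

int main(void) {
    int bits = 11 * 676;             /* 7436 packed bits           */
    int used = bits & 7;             /* 4 bits used in last byte   */
    unsigned char mask = (unsigned char) (0xff << (8 - used));
    printf("bytes=%d used_bits=%d mask=0x%02x\n", (bits + 7) / 8, used, mask);
    /* prints: bytes=930 used_bits=4 mask=0xf0 */
    return 0;
}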
@@ -0,0 +1,160 @@ | |||
#include "owcpa.h" | |||
#include "poly.h" | |||
#include "sample.h" | |||
static int owcpa_check_r(const poly *r) { | |||
/* Check that r is in message space. */ | |||
/* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */ | |||
int i; | |||
uint64_t t = 0; | |||
uint16_t c; | |||
for (i = 0; i < NTRU_N; i++) { | |||
c = MODQ(r->coeffs[i] + 1); | |||
t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */ | |||
t |= (c + 1) & 0x4; /* 0 if c is in {0,1,2} */ | |||
} | |||
t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */ | |||
t = (~t + 1); // two's complement | |||
t >>= 63; | |||
return (int) t; | |||
} | |||
static int owcpa_check_m(const poly *m) { | |||
/* Check that m is in message space. */ | |||
/* Note: Assumes that m has coefficients in {0,1,2}. */ | |||
int i; | |||
uint64_t t = 0; | |||
uint16_t p1 = 0; | |||
uint16_t m1 = 0; | |||
for (i = 0; i < NTRU_N; i++) { | |||
p1 += m->coeffs[i] & 0x01; | |||
m1 += (m->coeffs[i] & 0x02) >> 1; | |||
} | |||
/* Need p1 = m1 and p1 + m1 = NTRU_WEIGHT */ | |||
t |= p1 ^ m1; | |||
t |= (p1 + m1) ^ NTRU_WEIGHT; | |||
t = (~t + 1); // two's complement | |||
t >>= 63; | |||
return (int) t; | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_owcpa_keypair(unsigned char *pk, | |||
unsigned char *sk, | |||
const unsigned char seed[NTRU_SAMPLE_FG_BYTES]) { | |||
int i; | |||
poly x1, x2, x3, x4, x5; | |||
poly *f = &x1, *g = &x2, *invf_mod3 = &x3; | |||
poly *gf = &x3, *invgf = &x4, *tmp = &x5; | |||
poly *invh = &x3, *h = &x3; | |||
PQCLEAN_NTRUHPS2048677_AVX2_sample_fg(f, g, seed); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_inv(invf_mod3, f); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(sk, f); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(sk + NTRU_PACK_TRINARY_BYTES, invf_mod3); | |||
/* Lift coeffs of f and g from Z_p to Z_q */ | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(f); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(g); | |||
/* g = 3*g */ | |||
for (i = 0; i < NTRU_N; i++) { | |||
g->coeffs[i] = 3 * g->coeffs[i]; | |||
} | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(gf, g, f); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_inv(invgf, gf); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(tmp, invgf, f); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_mul(invh, tmp, f); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_tobytes(sk + 2 * NTRU_PACK_TRINARY_BYTES, invh); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(tmp, invgf, g); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(h, tmp, g); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_tobytes(pk, h); | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_owcpa_enc(unsigned char *c, | |||
const poly *r, | |||
const poly *m, | |||
const unsigned char *pk) { | |||
int i; | |||
poly x1, x2; | |||
poly *h = &x1, *liftm = &x1; | |||
poly *ct = &x2; | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_frombytes(h, pk); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(ct, r, h); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_lift(liftm, m); | |||
for (i = 0; i < NTRU_N; i++) { | |||
ct->coeffs[i] = ct->coeffs[i] + liftm->coeffs[i]; | |||
} | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_tobytes(c, ct); | |||
} | |||
int PQCLEAN_NTRUHPS2048677_AVX2_owcpa_dec(unsigned char *rm, | |||
const unsigned char *ciphertext, | |||
const unsigned char *secretkey) { | |||
int i; | |||
int fail; | |||
poly x1, x2, x3, x4; | |||
poly *c = &x1, *f = &x2, *cf = &x3; | |||
poly *mf = &x2, *finv3 = &x3, *m = &x4; | |||
poly *liftm = &x2, *invh = &x3, *r = &x4; | |||
poly *b = &x1; | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_frombytes(c, ciphertext); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_frombytes(f, secretkey); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(f); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(cf, c, f); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_to_S3(mf, cf); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_frombytes(finv3, secretkey + NTRU_PACK_TRINARY_BYTES); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_mul(m, mf, finv3); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(rm + NTRU_PACK_TRINARY_BYTES, m); | |||
/* NOTE: For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)). */ | |||
/* We can avoid re-computing r*h + Lift(m) as long as we check that */ | |||
/* r (defined as b/h mod (q, Phi_n)) and m are in the message space. */ | |||
/* (m can take any value in S3 in NTRU_HRSS) */ | |||
fail = 0; | |||
fail |= owcpa_check_m(m); | |||
/* b = c - Lift(m) mod (q, x^n - 1) */ | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_lift(liftm, m); | |||
for (i = 0; i < NTRU_N; i++) { | |||
b->coeffs[i] = c->coeffs[i] - liftm->coeffs[i]; | |||
} | |||
/* r = b / h mod (q, Phi_n) */ | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_frombytes(invh, secretkey + 2 * NTRU_PACK_TRINARY_BYTES); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_mul(r, b, invh); | |||
/* NOTE: Our definition of r as b/h mod (q, Phi_n) follows Figure 4 of */ | |||
/* [Sch18] https://eprint.iacr.org/2018/1174/20181203:032458. */ | |||
/* This differs from Figure 10 of Saito--Xagawa--Yamakawa */ | |||
/* [SXY17] https://eprint.iacr.org/2017/1005/20180516:055500 */ | |||
/* where r gets a final reduction modulo p. */ | |||
/* We need this change to use Proposition 1 of [Sch18]. */ | |||
/* Proposition 1 of [Sch18] shows that re-encryption with (r,m) yields c. */ | |||
/* if and only if fail==0 after the following call to owcpa_check_r */ | |||
/* The procedure given in Fig. 8 of [Sch18] can be skipped because we have */ | |||
/* c(1) = 0 due to the use of poly_Rq_sum_zero_{to,from}bytes. */ | |||
fail |= owcpa_check_r(r); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_trinary_Zq_to_Z3(r); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(rm, r); | |||
return fail; | |||
} |
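An exhaustive check (illustration only) that the two masked ORs in owcpa_check_r above accept exactly the coefficients {0, 1, q-1}, i.e. the Z_q lifts of {0, 1, -1}, for q = 2048:

#include <stdint.h>
#include <stdio.h>

#define Q 2048

int main(void) {
    int bad = 0;
    uint16_t v;
    for (v = 0; v < Q; v++) {
        uint16_t c = (uint16_t) ((v + 1) & (Q - 1)); /* c = MODQ(v + 1) */
        uint64_t t = 0;
        int accepted, expected;
        t |= c & (Q - 4);   /* 0 iff c in {0,1,2,3} */
        t |= (c + 1) & 0x4; /* 0 iff c in {0,1,2}   */
        accepted = (t == 0);
        expected = (v == 0 || v == 1 || v == Q - 1);
        bad += (accepted != expected);
    }
    printf("mismatches: %d\n", bad); /* expect 0 */
    return 0;
}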
@@ -0,0 +1,22 @@ | |||
#ifndef OWCPA_H | |||
#define OWCPA_H | |||
#include "params.h" | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048677_AVX2_owcpa_samplemsg(unsigned char msg[NTRU_OWCPA_MSGBYTES], | |||
const unsigned char seed[NTRU_SEEDBYTES]); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_owcpa_keypair(unsigned char *pk, | |||
unsigned char *sk, | |||
const unsigned char seed[NTRU_SEEDBYTES]); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_owcpa_enc(unsigned char *c, | |||
const poly *r, | |||
const poly *m, | |||
const unsigned char *pk); | |||
int PQCLEAN_NTRUHPS2048677_AVX2_owcpa_dec(unsigned char *rm, | |||
const unsigned char *ciphertext, | |||
const unsigned char *secretkey); | |||
#endif |
@@ -0,0 +1,46 @@ | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(unsigned char msg[NTRU_OWCPA_MSGBYTES], const poly *a) { | |||
int i; | |||
unsigned char c; | |||
int j; | |||
for (i = 0; i < NTRU_PACK_DEG / 5; i++) { | |||
c = a->coeffs[5 * i + 4] & 255; | |||
c = (3 * c + a->coeffs[5 * i + 3]) & 255; | |||
c = (3 * c + a->coeffs[5 * i + 2]) & 255; | |||
c = (3 * c + a->coeffs[5 * i + 1]) & 255; | |||
c = (3 * c + a->coeffs[5 * i + 0]) & 255; | |||
msg[i] = c; | |||
} | |||
i = NTRU_PACK_DEG / 5; | |||
c = 0; | |||
for (j = NTRU_PACK_DEG - (5 * i) - 1; j >= 0; j--) { | |||
c = (3 * c + a->coeffs[5 * i + j]) & 255; | |||
} | |||
msg[i] = c; | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_frombytes(poly *r, const unsigned char msg[NTRU_OWCPA_MSGBYTES]) { | |||
int i; | |||
unsigned char c; | |||
int j; | |||
for (i = 0; i < NTRU_PACK_DEG / 5; i++) { | |||
c = msg[i]; | |||
r->coeffs[5 * i + 0] = c; | |||
r->coeffs[5 * i + 1] = c * 171 >> 9; // this is division by 3 | |||
r->coeffs[5 * i + 2] = c * 57 >> 9; // division by 3^2 | |||
r->coeffs[5 * i + 3] = c * 19 >> 9; // division by 3^3 | |||
r->coeffs[5 * i + 4] = c * 203 >> 14; // etc. | |||
} | |||
i = NTRU_PACK_DEG / 5; | |||
c = msg[i]; | |||
for (j = 0; (5 * i + j) < NTRU_PACK_DEG; j++) { | |||
r->coeffs[5 * i + j] = c; | |||
c = c * 171 >> 9; | |||
} | |||
r->coeffs[NTRU_N - 1] = 0; | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n(r); | |||
} | |||
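A round-trip sketch (illustration only) of the 5-trits-per-byte packing above: Horner packing as in poly_S3_tobytes, multiply-shift "division by 3" as in poly_S3_frombytes, plus the mod-3 reduction that poly_mod_3_Phi_n applies afterwards. It exercises every packable byte value 0..242 = 3^5 - 1:

#include <stdio.h>

int main(void) {
    int bad = 0;
    int v;
    for (v = 0; v < 243; v++) {
        int t[5], i, x = v;
        unsigned char c = 0;
        int u0, u1, u2, u3, u4;
        for (i = 0; i < 5; i++) { /* base-3 digits of v */
            t[i] = x % 3;
            x /= 3;
        }
        for (i = 4; i >= 0; i--) { /* pack as in poly_S3_tobytes */
            c = (unsigned char) ((3 * c + t[i]) & 255);
        }
        /* unpack as in poly_S3_frombytes, then reduce mod 3 */
        u0 = c % 3;
        u1 = (c * 171 >> 9) % 3;  /* floor(c/3)  for c <= 242 */
        u2 = (c * 57 >> 9) % 3;   /* floor(c/9)  */
        u3 = (c * 19 >> 9) % 3;   /* floor(c/27) */
        u4 = (c * 203 >> 14) % 3; /* floor(c/81) */
        bad += (u0 != t[0]) + (u1 != t[1]) + (u2 != t[2]) + (u3 != t[3]) + (u4 != t[4]);
    }
    printf("mismatches: %d\n", bad); /* expect 0 */
    return 0;
}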
@@ -0,0 +1,93 @@ | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_tobytes(unsigned char *r, const poly *a) { | |||
int i, j; | |||
uint16_t t[8]; | |||
for (i = 0; i < NTRU_PACK_DEG / 8; i++) { | |||
for (j = 0; j < 8; j++) { | |||
t[j] = MODQ(a->coeffs[8 * i + j]); | |||
} | |||
r[11 * i + 0] = (unsigned char) ( t[0] & 0xff); | |||
r[11 * i + 1] = (unsigned char) ((t[0] >> 8) | ((t[1] & 0x1f) << 3)); | |||
r[11 * i + 2] = (unsigned char) ((t[1] >> 5) | ((t[2] & 0x03) << 6)); | |||
r[11 * i + 3] = (unsigned char) ((t[2] >> 2) & 0xff); | |||
r[11 * i + 4] = (unsigned char) ((t[2] >> 10) | ((t[3] & 0x7f) << 1)); | |||
r[11 * i + 5] = (unsigned char) ((t[3] >> 7) | ((t[4] & 0x0f) << 4)); | |||
r[11 * i + 6] = (unsigned char) ((t[4] >> 4) | ((t[5] & 0x01) << 7)); | |||
r[11 * i + 7] = (unsigned char) ((t[5] >> 1) & 0xff); | |||
r[11 * i + 8] = (unsigned char) ((t[5] >> 9) | ((t[6] & 0x3f) << 2)); | |||
r[11 * i + 9] = (unsigned char) ((t[6] >> 6) | ((t[7] & 0x07) << 5)); | |||
r[11 * i + 10] = (unsigned char) ((t[7] >> 3)); | |||
} | |||
for (j = 0; j < NTRU_PACK_DEG - 8 * i; j++) { | |||
t[j] = MODQ(a->coeffs[8 * i + j]); | |||
} | |||
for (; j < 8; j++) { | |||
t[j] = 0; | |||
} | |||
switch (NTRU_PACK_DEG & 0x07) { | |||
// cases 0 and 6 are impossible since 2 generates (Z/n)* and | |||
// p mod 8 in {1, 7} implies that 2 is a quadratic residue. | |||
case 4: | |||
r[11 * i + 0] = (unsigned char) (t[0] & 0xff); | |||
r[11 * i + 1] = (unsigned char) (t[0] >> 8) | ((t[1] & 0x1f) << 3); | |||
r[11 * i + 2] = (unsigned char) (t[1] >> 5) | ((t[2] & 0x03) << 6); | |||
r[11 * i + 3] = (unsigned char) (t[2] >> 2) & 0xff; | |||
r[11 * i + 4] = (unsigned char) (t[2] >> 10) | ((t[3] & 0x7f) << 1); | |||
r[11 * i + 5] = (unsigned char) (t[3] >> 7) | ((t[4] & 0x0f) << 4); | |||
break; | |||
case 2: | |||
r[11 * i + 0] = (unsigned char) (t[0] & 0xff); | |||
r[11 * i + 1] = (unsigned char) (t[0] >> 8) | ((t[1] & 0x1f) << 3); | |||
r[11 * i + 2] = (unsigned char) (t[1] >> 5) | ((t[2] & 0x03) << 6); | |||
break; | |||
} | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_frombytes(poly *r, const unsigned char *a) { | |||
int i; | |||
for (i = 0; i < NTRU_PACK_DEG / 8; i++) { | |||
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8); | |||
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5); | |||
r->coeffs[8 * i + 2] = (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10); | |||
r->coeffs[8 * i + 3] = (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7); | |||
r->coeffs[8 * i + 4] = (a[11 * i + 5] >> 4) | (((uint16_t)a[11 * i + 6] & 0x7f) << 4); | |||
r->coeffs[8 * i + 5] = (a[11 * i + 6] >> 7) | (((uint16_t)a[11 * i + 7] & 0xff) << 1) | (((uint16_t)a[11 * i + 8] & 0x03) << 9); | |||
r->coeffs[8 * i + 6] = (a[11 * i + 8] >> 2) | (((uint16_t)a[11 * i + 9] & 0x1f) << 6); | |||
r->coeffs[8 * i + 7] = (a[11 * i + 9] >> 5) | (((uint16_t)a[11 * i + 10] & 0xff) << 3); | |||
} | |||
switch (NTRU_PACK_DEG & 0x07) { | |||
// cases 0 and 6 are impossible since 2 generates (Z/n)* and | |||
// p mod 8 in {1, 7} implies that 2 is a quadratic residue. | |||
case 4: | |||
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8); | |||
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5); | |||
r->coeffs[8 * i + 2] = (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10); | |||
r->coeffs[8 * i + 3] = (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7); | |||
break; | |||
case 2: | |||
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8); | |||
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5); | |||
break; | |||
} | |||
r->coeffs[NTRU_N - 1] = 0; | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_tobytes(unsigned char *r, const poly *a) { | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_tobytes(r, a); | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_frombytes(poly *r, const unsigned char *a) { | |||
int i; | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_frombytes(r, a); | |||
/* Set r[n-1] so that the sum of coefficients is zero mod q */ | |||
r->coeffs[NTRU_N - 1] = 0; | |||
for (i = 0; i < NTRU_PACK_DEG; i++) { | |||
r->coeffs[NTRU_N - 1] -= r->coeffs[i]; | |||
} | |||
} |
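A round-trip sketch (illustration only) for one block of the 11-bit packing above: poly_Sq_tobytes spreads eight 11-bit coefficients across 11 bytes, and poly_Sq_frombytes reassembles them. The bit layouts below copy the two functions for a single block:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint16_t t[8] = {0, 1, 2047, 1024, 123, 456, 789, 2046}; /* 11-bit values */
    unsigned char r[11];
    uint16_t u[8];
    int j, bad = 0;

    /* pack as in poly_Sq_tobytes */
    r[0]  = (unsigned char) ( t[0] & 0xff);
    r[1]  = (unsigned char) ((t[0] >> 8) | ((t[1] & 0x1f) << 3));
    r[2]  = (unsigned char) ((t[1] >> 5) | ((t[2] & 0x03) << 6));
    r[3]  = (unsigned char) ((t[2] >> 2) & 0xff);
    r[4]  = (unsigned char) ((t[2] >> 10) | ((t[3] & 0x7f) << 1));
    r[5]  = (unsigned char) ((t[3] >> 7) | ((t[4] & 0x0f) << 4));
    r[6]  = (unsigned char) ((t[4] >> 4) | ((t[5] & 0x01) << 7));
    r[7]  = (unsigned char) ((t[5] >> 1) & 0xff);
    r[8]  = (unsigned char) ((t[5] >> 9) | ((t[6] & 0x3f) << 2));
    r[9]  = (unsigned char) ((t[6] >> 6) | ((t[7] & 0x07) << 5));
    r[10] = (unsigned char) ((t[7] >> 3));

    /* unpack as in poly_Sq_frombytes */
    u[0] = (uint16_t) ((r[0] >> 0) | (((uint16_t) r[1] & 0x07) << 8));
    u[1] = (uint16_t) ((r[1] >> 3) | (((uint16_t) r[2] & 0x3f) << 5));
    u[2] = (uint16_t) ((r[2] >> 6) | (((uint16_t) r[3] & 0xff) << 2) | (((uint16_t) r[4] & 0x01) << 10));
    u[3] = (uint16_t) ((r[4] >> 1) | (((uint16_t) r[5] & 0x0f) << 7));
    u[4] = (uint16_t) ((r[5] >> 4) | (((uint16_t) r[6] & 0x7f) << 4));
    u[5] = (uint16_t) ((r[6] >> 7) | (((uint16_t) r[7] & 0xff) << 1) | (((uint16_t) r[8] & 0x03) << 9));
    u[6] = (uint16_t) ((r[8] >> 2) | (((uint16_t) r[9] & 0x1f) << 6));
    u[7] = (uint16_t) ((r[9] >> 5) | (((uint16_t) r[10] & 0xff) << 3));

    for (j = 0; j < 8; j++) {
        bad += (u[j] != t[j]);
    }
    printf("mismatches: %d\n", bad); /* expect 0 */
    return 0;
}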
@@ -0,0 +1,37 @@ | |||
#ifndef PARAMS_H | |||
#define PARAMS_H | |||
#define NTRU_HPS | |||
#define NTRU_N 677 | |||
#define NTRU_LOGQ 11 | |||
/* Do not modify below this line */ | |||
#define PAD32(X) ((((X) + 31)/32)*32) | |||
#define NTRU_Q (1 << NTRU_LOGQ) | |||
#define NTRU_WEIGHT (NTRU_Q/8 - 2) | |||
#define NTRU_SEEDBYTES 32 | |||
#define NTRU_PRFKEYBYTES 32 | |||
#define NTRU_SHAREDKEYBYTES 32 | |||
#define NTRU_SAMPLE_IID_BYTES (NTRU_N-1) | |||
#define NTRU_SAMPLE_FT_BYTES ((30*(NTRU_N-1)+7)/8) | |||
#define NTRU_SAMPLE_FG_BYTES (NTRU_SAMPLE_IID_BYTES+NTRU_SAMPLE_FT_BYTES) | |||
#define NTRU_SAMPLE_RM_BYTES (NTRU_SAMPLE_IID_BYTES+NTRU_SAMPLE_FT_BYTES) | |||
#define NTRU_PACK_DEG (NTRU_N-1) | |||
#define NTRU_PACK_TRINARY_BYTES ((NTRU_PACK_DEG+4)/5) | |||
#define NTRU_OWCPA_MSGBYTES (2*NTRU_PACK_TRINARY_BYTES) | |||
#define NTRU_OWCPA_PUBLICKEYBYTES ((NTRU_LOGQ*NTRU_PACK_DEG+7)/8) | |||
#define NTRU_OWCPA_SECRETKEYBYTES (2*NTRU_PACK_TRINARY_BYTES + NTRU_OWCPA_PUBLICKEYBYTES) | |||
#define NTRU_OWCPA_BYTES ((NTRU_LOGQ*NTRU_PACK_DEG+7)/8) | |||
#define NTRU_PUBLICKEYBYTES (NTRU_OWCPA_PUBLICKEYBYTES) | |||
#define NTRU_SECRETKEYBYTES (NTRU_OWCPA_SECRETKEYBYTES + NTRU_PRFKEYBYTES) | |||
#define NTRU_CIPHERTEXTBYTES (NTRU_OWCPA_BYTES) | |||
#endif |
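A quick computation (illustration only) showing that the derived sizes in params.h above reproduce the api.h constants for ntruhps2048677:

#include <stdio.h>

int main(void) {
    int n = 677, logq = 11;
    int pack_deg = n - 1;                     /* 676                        */
    int trinary = (pack_deg + 4) / 5;         /* 136 bytes per S3 poly      */
    int owcpa_pk = (logq * pack_deg + 7) / 8; /* 930                        */
    int owcpa_sk = 2 * trinary + owcpa_pk;    /* 1202                       */
    int sk = owcpa_sk + 32;                   /* + NTRU_PRFKEYBYTES = 1234  */
    int weight = (1 << logq) / 8 - 2;         /* 254                        */
    printf("pk=%d ct=%d sk=%d weight=%d\n", owcpa_pk, owcpa_pk, sk, weight);
    /* prints: pk=930 ct=930 sk=1234 weight=254, matching api.h */
    return 0;
}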
@@ -0,0 +1,67 @@ | |||
#include "poly.h" | |||
/* Map {0, 1, 2} -> {0,1,q-1} in place */ | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(poly *r) { | |||
int i; | |||
for (i = 0; i < NTRU_N; i++) { | |||
r->coeffs[i] = r->coeffs[i] | ((-(r->coeffs[i] >> 1)) & (NTRU_Q - 1)); | |||
} | |||
} | |||
/* Map {0, 1, q-1} -> {0,1,2} in place */ | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_trinary_Zq_to_Z3(poly *r) { | |||
int i; | |||
for (i = 0; i < NTRU_N; i++) { | |||
r->coeffs[i] = MODQ(r->coeffs[i]); | |||
r->coeffs[i] = 3 & (r->coeffs[i] ^ (r->coeffs[i] >> (NTRU_LOGQ - 1))); | |||
} | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_mul(poly *r, const poly *a, const poly *b) { | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(r, a, b); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n(r); | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_mul(poly *r, const poly *a, const poly *b) { | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(r, a, b); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n(r); | |||
} | |||
static void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_inv_to_Rq_inv(poly *r, const poly *ai, const poly *a) { | |||
int i; | |||
poly b, c; | |||
poly s; | |||
// for 0..4 | |||
// ai = ai * (2 - a*ai) mod q | |||
for (i = 0; i < NTRU_N; i++) { | |||
b.coeffs[i] = -(a->coeffs[i]); | |||
} | |||
for (i = 0; i < NTRU_N; i++) { | |||
r->coeffs[i] = ai->coeffs[i]; | |||
} | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&c, r, &b); | |||
c.coeffs[0] += 2; // c = 2 - a*ai | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&s, &c, r); // s = ai*c | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&c, &s, &b); | |||
c.coeffs[0] += 2; // c = 2 - a*s | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(r, &c, &s); // r = s*c | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&c, r, &b); | |||
c.coeffs[0] += 2; // c = 2 - a*r | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&s, &c, r); // s = r*c | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&c, &s, &b); | |||
c.coeffs[0] += 2; // c = 2 - a*s | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(r, &c, &s); // r = s*c | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_inv(poly *r, const poly *a) { | |||
poly ai2; | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_inv(&ai2, a); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_inv_to_Rq_inv(r, &ai2, a); | |||
} |
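A scalar illustration (hypothetical names, not part of this patch) of the Newton/Hensel step used by poly_R2_inv_to_Rq_inv above: if ai = a^-1 mod 2^k, then ai*(2 - a*ai) = a^-1 mod 2^(2k), so four steps lift an inverse mod 2 to an inverse mod q = 2^11; the polynomial code does the same with Rq multiplications:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t q = 2048;
    uint32_t a = 1237; /* any odd value is invertible mod 2^11 */
    uint32_t ai = 1;   /* a^-1 mod 2 is 1 for odd a            */
    int step;
    for (step = 0; step < 4; step++) {
        /* correct mod 2 -> 4 -> 16 -> 256 -> 65536, which covers 2^11 */
        ai = (ai * (2 - a * ai)) & (q - 1);
        printf("step %d: a*ai mod q = %u\n", step, (a * ai) & (q - 1));
    }
    /* the last printed value is 1 */
    return 0;
}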
@@ -0,0 +1,41 @@ | |||
#ifndef POLY_H | |||
#define POLY_H | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include "params.h" | |||
#define MODQ(X) ((X) & (NTRU_Q-1)) | |||
typedef union { /* align to 32 byte boundary for vmovdqa */ | |||
uint16_t coeffs[PAD32(NTRU_N)]; | |||
__m256i coeffs_x16[PAD32(NTRU_N) / 16]; | |||
} poly; | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n(poly *r); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n(poly *r); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_tobytes(unsigned char *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_frombytes(poly *r, const unsigned char *a); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_tobytes(unsigned char *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_frombytes(poly *r, const unsigned char *a); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(unsigned char msg[NTRU_PACK_TRINARY_BYTES], const poly *a); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_frombytes(poly *r, const unsigned char msg[NTRU_PACK_TRINARY_BYTES]); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_mul(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_mul(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_lift(poly *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_to_S3(poly *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_inv(poly *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_inv(poly *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_inv(poly *r, const poly *a); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(poly *r); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_trinary_Zq_to_Z3(poly *r); | |||
#endif |
@@ -0,0 +1,11 @@ | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_lift(poly *r, const poly *a) { | |||
int i; | |||
for (i = 0; i < NTRU_N; i++) { | |||
r->coeffs[i] = a->coeffs[i]; | |||
} | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(r); | |||
} | |||
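For orientation, a scalar sketch (hypothetical names; mirrors the clean implementation, illustration only) of what the unrolled AVX2 poly_mod_3_Phi_n below computes: add 2*c_{n-1} to every coefficient (subtracting the leading coefficient mod 3 reduces mod Phi_n), then fold each 16-bit value to its residue mod 3 with the same byte/nibble/crumb folding and branchless conditional subtract as the vector code:

#include <stdint.h>

#define N 677

static uint16_t fold_mod3(uint16_t r) {
    uint16_t t, m;
    r = (uint16_t) ((r >> 8) + (r & 0xff)); /* r mod 255 == r mod 255; 3 | 255 */
    r = (uint16_t) ((r >> 4) + (r & 0xf));  /* 3 | 15 */
    r = (uint16_t) ((r >> 2) + (r & 0x3));  /* 3 | 3  */
    r = (uint16_t) ((r >> 2) + (r & 0x3));  /* now r in {0,...,5} */
    t = (uint16_t) (r - 3);
    m = (uint16_t) (0 - (t >> 15));         /* all-ones iff r < 3 */
    return (uint16_t) ((m & r) | (~m & t)); /* r if r < 3, else r - 3 */
}

void mod_3_phi_n(uint16_t coeffs[N]) {
    uint16_t x = (uint16_t) (2 * coeffs[N - 1]);
    int i;
    for (i = 0; i < N; i++) {
        coeffs[i] = fold_mod3((uint16_t) (coeffs[i] + x));
    }
}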
@@ -0,0 +1,928 @@ | |||
.data | |||
.p2align 5 | |||
mask_ff: | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
.word 0xff | |||
mask_f: | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
.word 0xf | |||
mask_3: | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.word 0x03 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n | |||
.global _PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n: | |||
_PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n: | |||
vmovdqa 1344(%rdi), %ymm0 | |||
vpermq $1, %ymm0, %ymm0 | |||
vpslld $17, %ymm0, %ymm0 | |||
vpsrld $16, %ymm0, %ymm1 | |||
vpor %ymm0, %ymm1, %ymm0 | |||
vbroadcastss %xmm0, %ymm0 | |||
vpaddw 0(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 0(%rdi) | |||
vpaddw 32(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 32(%rdi) | |||
vpaddw 64(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 64(%rdi) | |||
vpaddw 96(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 96(%rdi) | |||
vpaddw 128(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 128(%rdi) | |||
vpaddw 160(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 160(%rdi) | |||
vpaddw 192(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 192(%rdi) | |||
vpaddw 224(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 224(%rdi) | |||
vpaddw 256(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 256(%rdi) | |||
vpaddw 288(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 288(%rdi) | |||
vpaddw 320(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 320(%rdi) | |||
vpaddw 352(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 352(%rdi) | |||
vpaddw 384(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 384(%rdi) | |||
vpaddw 416(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 416(%rdi) | |||
vpaddw 448(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 448(%rdi) | |||
vpaddw 480(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 480(%rdi) | |||
vpaddw 512(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 512(%rdi) | |||
vpaddw 544(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 544(%rdi) | |||
vpaddw 576(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 576(%rdi) | |||
vpaddw 608(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 608(%rdi) | |||
vpaddw 640(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 640(%rdi) | |||
vpaddw 672(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 672(%rdi) | |||
vpaddw 704(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 704(%rdi) | |||
vpaddw 736(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 736(%rdi) | |||
vpaddw 768(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 768(%rdi) | |||
vpaddw 800(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 800(%rdi) | |||
vpaddw 832(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 832(%rdi) | |||
vpaddw 864(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 864(%rdi) | |||
vpaddw 896(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 896(%rdi) | |||
vpaddw 928(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 928(%rdi) | |||
vpaddw 960(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 960(%rdi) | |||
vpaddw 992(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 992(%rdi) | |||
vpaddw 1024(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1024(%rdi) | |||
vpaddw 1056(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1056(%rdi) | |||
vpaddw 1088(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1088(%rdi) | |||
vpaddw 1120(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1120(%rdi) | |||
vpaddw 1152(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1152(%rdi) | |||
vpaddw 1184(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1184(%rdi) | |||
vpaddw 1216(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1216(%rdi) | |||
vpaddw 1248(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1248(%rdi) | |||
vpaddw 1280(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1280(%rdi) | |||
vpaddw 1312(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1312(%rdi) | |||
vpaddw 1344(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1344(%rdi) | |||
vpaddw 1376(%rdi), %ymm0, %ymm1 | |||
vpsrlw $8, %ymm1, %ymm2 | |||
vpand mask_ff(%rip), %ymm1, %ymm1 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_f(%rip), %ymm2, %ymm1 | |||
vpsrlw $4, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpand mask_3(%rip), %ymm2, %ymm1 | |||
vpsrlw $2, %ymm2, %ymm2 | |||
vpaddw %ymm2, %ymm1, %ymm2 | |||
vpsubw mask_3(%rip), %ymm2, %ymm14 | |||
vpsraw $15, %ymm14, %ymm15 | |||
vpandn %ymm14, %ymm15, %ymm1 | |||
vpand %ymm15, %ymm2, %ymm14 | |||
vpxor %ymm14, %ymm1, %ymm2 | |||
vmovdqa %ymm2, 1376(%rdi) | |||
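# Zero the padding words above x^676 (coefficients 677..703, byte offsets 1354..1406).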
movw $0, 1354(%rdi) | |||
movw $0, 1356(%rdi) | |||
movw $0, 1358(%rdi) | |||
movw $0, 1360(%rdi) | |||
movw $0, 1362(%rdi) | |||
movw $0, 1364(%rdi) | |||
movw $0, 1366(%rdi) | |||
movw $0, 1368(%rdi) | |||
movw $0, 1370(%rdi) | |||
movw $0, 1372(%rdi) | |||
movw $0, 1374(%rdi) | |||
movw $0, 1376(%rdi) | |||
movw $0, 1378(%rdi) | |||
movw $0, 1380(%rdi) | |||
movw $0, 1382(%rdi) | |||
movw $0, 1384(%rdi) | |||
movw $0, 1386(%rdi) | |||
movw $0, 1388(%rdi) | |||
movw $0, 1390(%rdi) | |||
movw $0, 1392(%rdi) | |||
movw $0, 1394(%rdi) | |||
movw $0, 1396(%rdi) | |||
movw $0, 1398(%rdi) | |||
movw $0, 1400(%rdi) | |||
movw $0, 1402(%rdi) | |||
movw $0, 1404(%rdi) | |||
movw $0, 1406(%rdi) | |||
ret |
@@ -0,0 +1,104 @@ | |||
.data | |||
.p2align 5 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n | |||
.global _PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n: | |||
_PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n: | |||
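# Reduce mod Phi_n = 1 + x + ... + x^676 (n = 677): broadcast the x^676
# coefficient (the 16-bit word at byte offset 1352), negate it, and add it
# to every word of the padded polynomial. This subtracts c_676 * Phi_n from
# the proper coefficients, zeroing the x^676 term while leaving the value
# unchanged mod Phi_n.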
vmovdqa 1344(%rdi), %ymm0 | |||
vpermq $1, %ymm0, %ymm0 | |||
vpslld $16, %ymm0, %ymm0 | |||
vpsrld $16, %ymm0, %ymm1 | |||
vpor %ymm0, %ymm1, %ymm0 | |||
vbroadcastss %xmm0, %ymm0 | |||
vpxor %ymm1, %ymm1, %ymm1
vpsubw %ymm0, %ymm1, %ymm0 | |||
vpaddw 0(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 0(%rdi) | |||
vpaddw 32(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 32(%rdi) | |||
vpaddw 64(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 64(%rdi) | |||
vpaddw 96(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 96(%rdi) | |||
vpaddw 128(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 128(%rdi) | |||
vpaddw 160(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 160(%rdi) | |||
vpaddw 192(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 192(%rdi) | |||
vpaddw 224(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 224(%rdi) | |||
vpaddw 256(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 256(%rdi) | |||
vpaddw 288(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 288(%rdi) | |||
vpaddw 320(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 320(%rdi) | |||
vpaddw 352(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 352(%rdi) | |||
vpaddw 384(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 384(%rdi) | |||
vpaddw 416(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 416(%rdi) | |||
vpaddw 448(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 448(%rdi) | |||
vpaddw 480(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 480(%rdi) | |||
vpaddw 512(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 512(%rdi) | |||
vpaddw 544(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 544(%rdi) | |||
vpaddw 576(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 576(%rdi) | |||
vpaddw 608(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 608(%rdi) | |||
vpaddw 640(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 640(%rdi) | |||
vpaddw 672(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 672(%rdi) | |||
vpaddw 704(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 704(%rdi) | |||
vpaddw 736(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 736(%rdi) | |||
vpaddw 768(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 768(%rdi) | |||
vpaddw 800(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 800(%rdi) | |||
vpaddw 832(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 832(%rdi) | |||
vpaddw 864(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 864(%rdi) | |||
vpaddw 896(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 896(%rdi) | |||
vpaddw 928(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 928(%rdi) | |||
vpaddw 960(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 960(%rdi) | |||
vpaddw 992(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 992(%rdi) | |||
vpaddw 1024(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1024(%rdi) | |||
vpaddw 1056(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1056(%rdi) | |||
vpaddw 1088(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1088(%rdi) | |||
vpaddw 1120(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1120(%rdi) | |||
vpaddw 1152(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1152(%rdi) | |||
vpaddw 1184(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1184(%rdi) | |||
vpaddw 1216(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1216(%rdi) | |||
vpaddw 1248(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1248(%rdi) | |||
vpaddw 1280(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1280(%rdi) | |||
vpaddw 1312(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1312(%rdi) | |||
vpaddw 1344(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1344(%rdi) | |||
vpaddw 1376(%rdi), %ymm0, %ymm1 | |||
vmovdqa %ymm1, 1376(%rdi) | |||
ret |
@@ -0,0 +1,73 @@ | |||
#include "poly_r2_inv.h" | |||
#include "poly.h" | |||
// TODO: this costs 1764 cycles (implementing it the same way as S3_to_bytes costs 2108).
// It could be implemented nicely in assembly using the pdep / pext instructions.
void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_tobytes(unsigned char *out, const poly *a) { | |||
int i, j, k; | |||
for (i = 0; i < 12; i++) { | |||
for (k = 0; k < 8; k++) { | |||
out[i * 8 + k] = 0; | |||
for (j = 0; j < 8; j++) { | |||
if ((i * 8 + k) * 8 + j < NTRU_N) { | |||
out[i * 8 + k] |= (a->coeffs[(i * 8 + k) * 8 + j] & 1) << j; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_frombytes(poly *a, const unsigned char *in) { | |||
int i, j, k; | |||
for (i = 0; i < 12; i++) { | |||
for (k = 0; k < 8; k++) { | |||
for (j = 0; j < 8; j++) { | |||
if ((i * 8 + k) * 8 + j < NTRU_N) { | |||
a->coeffs[(i * 8 + k) * 8 + j] = (in[i * 8 + k] >> j) & 1; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_inv(poly *r, const poly *a) { | |||
union { | |||
unsigned char s[96]; | |||
__m256i s_x32[3]; | |||
} squares[13]; | |||
#define s(x) squares[(x)].s | |||
// This relies on the following addition chain: | |||
// 1, 2, 3, 5, 10, 20, 21, 42, 84, 168, 336, 672, 675 | |||
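// Each s(k) holds a^(2^e - 1) for the k-th chain element e: if s = a^(2^e - 1)
// and t = a^(2^j - 1), then square_j(s) * t = a^(2^(e+j) - 1). The chain ends
// at s(12) = a^(2^675 - 1), and the final squaring yields a^(2^676 - 2) = a^(-1):
// Phi_677 is irreducible over GF(2) (2 has order 676 mod 677), so R/2 is the
// field GF(2^676) and inversion is exponentiation by 2^676 - 2.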
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_tobytes(s(0), a); // TODO alignment | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_1_677(s(1), s(0)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(1), s(1), s(0)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_1_677(s(2), s(1)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(2), s(2), s(0)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_2_677(s(3), s(2)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(3), s(3), s(1)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_5_677(s(4), s(3)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(4), s(4), s(3)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_10_677(s(5), s(4)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(5), s(5), s(4)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_1_677(s(6), s(5)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(6), s(6), s(0)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_21_677(s(7), s(6)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(7), s(7), s(6)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_42_677(s(8), s(7)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(8), s(8), s(7)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_84_677(s(9), s(8)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(9), s(9), s(8)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_168_677(s(10), s(9)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(10), s(10), s(9)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_336_677(s(11), s(10)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(11), s(11), s(10)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_3_677(s(12), s(11)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(12), s(12), s(2)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_1_677(s(0), s(12)); | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_frombytes(r, s(0)); | |||
#undef s | |||
} |
@@ -0,0 +1,23 @@ | |||
#ifndef POLY_R2_INV_H | |||
#define POLY_R2_INV_H | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_tobytes(unsigned char *out, const poly *a); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_frombytes(poly *a, const unsigned char *in); | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_1_677(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_2_677(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_3_677(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_5_677(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_10_677(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_21_677(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_42_677(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_84_677(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_168_677(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_336_677(unsigned char *out, const unsigned char *a); | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(unsigned char *out, const unsigned char *a, | |||
const unsigned char *b); | |||
#endif |
@@ -0,0 +1,466 @@ | |||
.data | |||
.p2align 5 | |||
mask1100: | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
mask0110: | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
mask0011: | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
mask1000: | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
mask0111: | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
low165: | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 65535 | |||
.word 31 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.word 0 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul | |||
.global _PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul | |||
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul: | |||
_PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul: | |||
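# Carry-less multiplication in GF(2)[x]/(x^677 - 1). The 677-bit inputs are
# handled as three 256-bit limbs combined Karatsuba-style into six 256x256
# products; each such product is itself Karatsuba over 128-bit halves built
# from 64x64 vpclmulqdq multiplies. The tail folds bits >= 677 back down
# (677 = 10*64 + 37, hence the 37-bit right / 27-bit left shifts) and masks
# the top limb to 165 = 677 - 512 bits (low165).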
vmovdqa 0(%rsi), %ymm0 | |||
vmovdqa 32(%rsi), %ymm1 | |||
vmovdqa 0(%rdx), %ymm3 | |||
vmovdqa 32(%rdx), %ymm4 | |||
vpxor %ymm0, %ymm1, %ymm6 | |||
vpxor %ymm3, %ymm4, %ymm7 | |||
vextracti128 $1, %ymm0, %xmm11 | |||
vextracti128 $1, %ymm3, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm5 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm14 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm15 | |||
vpxor %xmm5, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm5 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm5, %ymm5 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm5, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm5 | |||
vpxor %xmm0, %xmm11, %xmm11 | |||
vpxor %xmm3, %xmm12, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm14 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm15 | |||
vpxor %xmm13, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm13 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm13, %ymm13 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm13, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm13 | |||
vpclmulqdq $1, %xmm0, %xmm3, %xmm2 | |||
vpclmulqdq $16, %xmm0, %xmm3, %xmm14 | |||
vpclmulqdq $17, %xmm0, %xmm3, %xmm15 | |||
vpxor %xmm2, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm0, %xmm3, %xmm2 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm2, %ymm2 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm2, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm2 | |||
vpxor %ymm13, %ymm5, %ymm13 | |||
vpxor %ymm13, %ymm2, %ymm13 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vextracti128 $1, %ymm13, %xmm11 | |||
vpxor %ymm5, %ymm11, %ymm5 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vinserti128 $1, %xmm13, %ymm11, %ymm11 | |||
vpxor %ymm11, %ymm2, %ymm2 | |||
vextracti128 $1, %ymm6, %xmm11 | |||
vextracti128 $1, %ymm7, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm9 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm14 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm15 | |||
vpxor %xmm9, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm9 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm9, %ymm9 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm9, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm9 | |||
vpxor %xmm6, %xmm11, %xmm11 | |||
vpxor %xmm7, %xmm12, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm14 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm15 | |||
vpxor %xmm13, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm13 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm13, %ymm13 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm13, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm13 | |||
vpclmulqdq $1, %xmm6, %xmm7, %xmm8 | |||
vpclmulqdq $16, %xmm6, %xmm7, %xmm14 | |||
vpclmulqdq $17, %xmm6, %xmm7, %xmm15 | |||
vpxor %xmm8, %xmm14, %xmm14 | |||
vpclmulqdq $0, %xmm6, %xmm7, %xmm8 | |||
vpermq $16, %ymm14, %ymm14 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm8, %ymm8 | |||
vpand mask0110(%rip), %ymm14, %ymm14 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm8, %ymm14, %ymm14 | |||
vpxor %ymm14, %ymm15, %ymm8 | |||
vpxor %ymm13, %ymm9, %ymm13 | |||
vpxor %ymm13, %ymm8, %ymm13 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vextracti128 $1, %ymm13, %xmm11 | |||
vpxor %ymm9, %ymm11, %ymm9 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vinserti128 $1, %xmm13, %ymm11, %ymm11 | |||
vpxor %ymm11, %ymm8, %ymm8 | |||
vpxor %ymm8, %ymm2, %ymm0 | |||
vpxor %ymm9, %ymm5, %ymm3 | |||
vpxor %ymm5, %ymm0, %ymm0 | |||
vpxor %ymm3, %ymm8, %ymm8 | |||
vmovdqa 64(%rsi), %ymm10 | |||
vmovdqa 64(%rdx), %ymm15 | |||
vpxor %ymm6, %ymm10, %ymm6 | |||
vpxor %ymm7, %ymm15, %ymm7 | |||
vextracti128 $1, %ymm6, %xmm11 | |||
vextracti128 $1, %ymm7, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm5 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm14 | |||
vpxor %xmm5, %xmm13, %xmm13 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm5 | |||
vpermq $16, %ymm13, %ymm13 | |||
vinserti128 $1, %xmm14, %ymm14, %ymm14 | |||
vpand mask0011(%rip), %ymm5, %ymm5 | |||
vpand mask0110(%rip), %ymm13, %ymm13 | |||
vpand mask1100(%rip), %ymm14, %ymm14 | |||
vpxor %ymm5, %ymm13, %ymm13 | |||
vpxor %ymm13, %ymm14, %ymm5 | |||
vpclmulqdq $1, %xmm6, %xmm7, %xmm3 | |||
vpclmulqdq $16, %xmm6, %xmm7, %xmm13 | |||
vpclmulqdq $17, %xmm6, %xmm7, %xmm14 | |||
vpxor %xmm3, %xmm13, %xmm13 | |||
vpclmulqdq $0, %xmm6, %xmm7, %xmm3 | |||
vpermq $16, %ymm13, %ymm13 | |||
vinserti128 $1, %xmm14, %ymm14, %ymm14 | |||
vpand mask0011(%rip), %ymm3, %ymm3 | |||
vpand mask0110(%rip), %ymm13, %ymm13 | |||
vpand mask1100(%rip), %ymm14, %ymm14 | |||
vpxor %ymm3, %ymm13, %ymm13 | |||
vpxor %ymm13, %ymm14, %ymm3 | |||
vpxor %xmm6, %xmm11, %xmm11 | |||
vpxor %xmm7, %xmm12, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm6 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm7 | |||
vpxor %xmm13, %xmm6, %xmm6 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm13 | |||
vpermq $16, %ymm6, %ymm6 | |||
vinserti128 $1, %xmm7, %ymm7, %ymm7 | |||
vpand mask0011(%rip), %ymm13, %ymm13 | |||
vpand mask0110(%rip), %ymm6, %ymm6 | |||
vpand mask1100(%rip), %ymm7, %ymm7 | |||
vpxor %ymm13, %ymm6, %ymm6 | |||
vpxor %ymm6, %ymm7, %ymm13 | |||
vpxor %ymm13, %ymm5, %ymm13 | |||
vpxor %ymm13, %ymm3, %ymm13 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vextracti128 $1, %ymm13, %xmm11 | |||
vpxor %ymm5, %ymm11, %ymm5 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vinserti128 $1, %xmm13, %ymm11, %ymm11 | |||
vpxor %ymm11, %ymm3, %ymm3 | |||
vpxor %ymm3, %ymm8, %ymm8 | |||
vpxor %ymm5, %ymm9, %ymm9 | |||
vpxor %ymm1, %ymm10, %ymm6 | |||
vpxor %ymm4, %ymm15, %ymm7 | |||
vextracti128 $1, %ymm6, %xmm11 | |||
vextracti128 $1, %ymm7, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm5 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm14 | |||
vpxor %xmm5, %xmm13, %xmm13 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm5 | |||
vpermq $16, %ymm13, %ymm13 | |||
vinserti128 $1, %xmm14, %ymm14, %ymm14 | |||
vpand mask0011(%rip), %ymm5, %ymm5 | |||
vpand mask0110(%rip), %ymm13, %ymm13 | |||
vpand mask1100(%rip), %ymm14, %ymm14 | |||
vpxor %ymm5, %ymm13, %ymm13 | |||
vpxor %ymm13, %ymm14, %ymm5 | |||
vpclmulqdq $1, %xmm6, %xmm7, %xmm3 | |||
vpclmulqdq $16, %xmm6, %xmm7, %xmm13 | |||
vpclmulqdq $17, %xmm6, %xmm7, %xmm14 | |||
vpxor %xmm3, %xmm13, %xmm13 | |||
vpclmulqdq $0, %xmm6, %xmm7, %xmm3 | |||
vpermq $16, %ymm13, %ymm13 | |||
vinserti128 $1, %xmm14, %ymm14, %ymm14 | |||
vpand mask0011(%rip), %ymm3, %ymm3 | |||
vpand mask0110(%rip), %ymm13, %ymm13 | |||
vpand mask1100(%rip), %ymm14, %ymm14 | |||
vpxor %ymm3, %ymm13, %ymm13 | |||
vpxor %ymm13, %ymm14, %ymm3 | |||
vpxor %xmm6, %xmm11, %xmm11 | |||
vpxor %xmm7, %xmm12, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm6 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm7 | |||
vpxor %xmm13, %xmm6, %xmm6 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm13 | |||
vpermq $16, %ymm6, %ymm6 | |||
vinserti128 $1, %xmm7, %ymm7, %ymm7 | |||
vpand mask0011(%rip), %ymm13, %ymm13 | |||
vpand mask0110(%rip), %ymm6, %ymm6 | |||
vpand mask1100(%rip), %ymm7, %ymm7 | |||
vpxor %ymm13, %ymm6, %ymm6 | |||
vpxor %ymm6, %ymm7, %ymm13 | |||
vpxor %ymm13, %ymm5, %ymm13 | |||
vpxor %ymm13, %ymm3, %ymm13 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vextracti128 $1, %ymm13, %xmm11 | |||
vpxor %ymm5, %ymm11, %ymm5 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vinserti128 $1, %xmm13, %ymm11, %ymm11 | |||
vpxor %ymm11, %ymm3, %ymm3 | |||
vpxor %ymm3, %ymm8, %ymm8 | |||
vpxor %ymm5, %ymm9, %ymm9 | |||
vextracti128 $1, %ymm1, %xmm11 | |||
vextracti128 $1, %ymm4, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm7 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm14 | |||
vpxor %xmm7, %xmm13, %xmm13 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm7 | |||
vpermq $16, %ymm13, %ymm13 | |||
vinserti128 $1, %xmm14, %ymm14, %ymm14 | |||
vpand mask0011(%rip), %ymm7, %ymm7 | |||
vpand mask0110(%rip), %ymm13, %ymm13 | |||
vpand mask1100(%rip), %ymm14, %ymm14 | |||
vpxor %ymm7, %ymm13, %ymm13 | |||
vpxor %ymm13, %ymm14, %ymm7 | |||
vpclmulqdq $1, %xmm1, %xmm4, %xmm6 | |||
vpclmulqdq $16, %xmm1, %xmm4, %xmm13 | |||
vpclmulqdq $17, %xmm1, %xmm4, %xmm14 | |||
vpxor %xmm6, %xmm13, %xmm13 | |||
vpclmulqdq $0, %xmm1, %xmm4, %xmm6 | |||
vpermq $16, %ymm13, %ymm13 | |||
vinserti128 $1, %xmm14, %ymm14, %ymm14 | |||
vpand mask0011(%rip), %ymm6, %ymm6 | |||
vpand mask0110(%rip), %ymm13, %ymm13 | |||
vpand mask1100(%rip), %ymm14, %ymm14 | |||
vpxor %ymm6, %ymm13, %ymm13 | |||
vpxor %ymm13, %ymm14, %ymm6 | |||
vpxor %xmm1, %xmm11, %xmm11 | |||
vpxor %xmm4, %xmm12, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm1 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm4 | |||
vpxor %xmm13, %xmm1, %xmm1 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm13 | |||
vpermq $16, %ymm1, %ymm1 | |||
vinserti128 $1, %xmm4, %ymm4, %ymm4 | |||
vpand mask0011(%rip), %ymm13, %ymm13 | |||
vpand mask0110(%rip), %ymm1, %ymm1 | |||
vpand mask1100(%rip), %ymm4, %ymm4 | |||
vpxor %ymm13, %ymm1, %ymm1 | |||
vpxor %ymm1, %ymm4, %ymm13 | |||
vpxor %ymm13, %ymm7, %ymm13 | |||
vpxor %ymm13, %ymm6, %ymm13 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vextracti128 $1, %ymm13, %xmm11 | |||
vpxor %ymm7, %ymm11, %ymm7 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vinserti128 $1, %xmm13, %ymm11, %ymm11 | |||
vpxor %ymm11, %ymm6, %ymm6 | |||
vpxor %ymm6, %ymm0, %ymm0 | |||
vpxor %ymm7, %ymm8, %ymm8 | |||
vpxor %ymm6, %ymm3, %ymm3 | |||
vpxor %ymm7, %ymm5, %ymm5 | |||
vextracti128 $1, %ymm10, %xmm11 | |||
vextracti128 $1, %ymm15, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm4 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm14 | |||
vpxor %xmm4, %xmm13, %xmm13 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm4 | |||
vpermq $16, %ymm13, %ymm13 | |||
vinserti128 $1, %xmm14, %ymm14, %ymm14 | |||
vpand mask0011(%rip), %ymm4, %ymm4 | |||
vpand mask0110(%rip), %ymm13, %ymm13 | |||
vpand mask1100(%rip), %ymm14, %ymm14 | |||
vpxor %ymm4, %ymm13, %ymm13 | |||
vpxor %ymm13, %ymm14, %ymm4 | |||
vpclmulqdq $1, %xmm10, %xmm15, %xmm1 | |||
vpclmulqdq $16, %xmm10, %xmm15, %xmm13 | |||
vpclmulqdq $17, %xmm10, %xmm15, %xmm14 | |||
vpxor %xmm1, %xmm13, %xmm13 | |||
vpclmulqdq $0, %xmm10, %xmm15, %xmm1 | |||
vpermq $16, %ymm13, %ymm13 | |||
vinserti128 $1, %xmm14, %ymm14, %ymm14 | |||
vpand mask0011(%rip), %ymm1, %ymm1 | |||
vpand mask0110(%rip), %ymm13, %ymm13 | |||
vpand mask1100(%rip), %ymm14, %ymm14 | |||
vpxor %ymm1, %ymm13, %ymm13 | |||
vpxor %ymm13, %ymm14, %ymm1 | |||
vpxor %xmm10, %xmm11, %xmm11 | |||
vpxor %xmm15, %xmm12, %xmm12 | |||
vpclmulqdq $1, %xmm11, %xmm12, %xmm13 | |||
vpclmulqdq $16, %xmm11, %xmm12, %xmm10 | |||
vpclmulqdq $17, %xmm11, %xmm12, %xmm15 | |||
vpxor %xmm13, %xmm10, %xmm10 | |||
vpclmulqdq $0, %xmm11, %xmm12, %xmm13 | |||
vpermq $16, %ymm10, %ymm10 | |||
vinserti128 $1, %xmm15, %ymm15, %ymm15 | |||
vpand mask0011(%rip), %ymm13, %ymm13 | |||
vpand mask0110(%rip), %ymm10, %ymm10 | |||
vpand mask1100(%rip), %ymm15, %ymm15 | |||
vpxor %ymm13, %ymm10, %ymm10 | |||
vpxor %ymm10, %ymm15, %ymm13 | |||
vpxor %ymm13, %ymm4, %ymm13 | |||
vpxor %ymm13, %ymm1, %ymm13 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vextracti128 $1, %ymm13, %xmm11 | |||
vpxor %ymm4, %ymm11, %ymm4 | |||
vpxor %ymm11, %ymm11, %ymm11 | |||
vinserti128 $1, %xmm13, %ymm11, %ymm11 | |||
vpxor %ymm11, %ymm1, %ymm1 | |||
vpxor %ymm1, %ymm3, %ymm3 | |||
vpxor %ymm4, %ymm5, %ymm5 | |||
vpxor %ymm9, %ymm3, %ymm3 | |||
vpxor %ymm5, %ymm1, %ymm1 | |||
vpand mask1100(%rip), %ymm8, %ymm13 | |||
vpand mask0011(%rip), %ymm3, %ymm12 | |||
vpxor %ymm12, %ymm13, %ymm12 | |||
vpsrlq $37, %ymm12, %ymm12 | |||
vpermq $78, %ymm12, %ymm12 | |||
vpxor %ymm12, %ymm2, %ymm2 | |||
vpand mask1000(%rip), %ymm8, %ymm12 | |||
vpand mask0111(%rip), %ymm3, %ymm13 | |||
vpxor %ymm12, %ymm13, %ymm12 | |||
vpsllq $27, %ymm12, %ymm12 | |||
vpermq $147, %ymm12, %ymm12 | |||
vpxor %ymm12, %ymm2, %ymm2 | |||
vpand mask1100(%rip), %ymm3, %ymm13 | |||
vpand mask0011(%rip), %ymm1, %ymm12 | |||
vpxor %ymm12, %ymm13, %ymm12 | |||
vpsrlq $37, %ymm12, %ymm12 | |||
vpermq $78, %ymm12, %ymm12 | |||
vpxor %ymm12, %ymm0, %ymm0 | |||
vpand mask1000(%rip), %ymm3, %ymm12 | |||
vpand mask0111(%rip), %ymm1, %ymm13 | |||
vpxor %ymm12, %ymm13, %ymm12 | |||
vpsllq $27, %ymm12, %ymm12 | |||
vpermq $147, %ymm12, %ymm12 | |||
vpxor %ymm12, %ymm0, %ymm0 | |||
vpand mask1100(%rip), %ymm1, %ymm13 | |||
vpand mask0011(%rip), %ymm4, %ymm12 | |||
vpxor %ymm12, %ymm13, %ymm12 | |||
vpsrlq $37, %ymm12, %ymm12 | |||
vpermq $78, %ymm12, %ymm12 | |||
vpxor %ymm12, %ymm8, %ymm8 | |||
vpand mask1000(%rip), %ymm1, %ymm12 | |||
vpand mask0111(%rip), %ymm4, %ymm13 | |||
vpxor %ymm12, %ymm13, %ymm12 | |||
vpsllq $27, %ymm12, %ymm12 | |||
vpermq $147, %ymm12, %ymm12 | |||
vpxor %ymm12, %ymm8, %ymm8 | |||
vpand low165(%rip), %ymm8, %ymm8 | |||
vmovdqa %ymm2, 0(%rdi) | |||
vmovdqa %ymm0, 32(%rdi) | |||
vmovdqa %ymm8, 64(%rdi) | |||
ret |
@@ -0,0 +1,569 @@ | |||
#include "poly.h" | |||
#include <immintrin.h> | |||
typedef signed char small; | |||
#define p 676 | |||
#define ppad 768 | |||
#define numvec 3 | |||
typedef __m256i vec256; | |||
/*
  This code stores a 768-coefficient polynomial as vec256[3].
  The order of the 256 coefficients in each vec256
  is optimized in light of the costs of vector instructions:
    0,4,...,252 in one 64-bit word;
    1,5,...,253 in the next 64-bit word;
    2,6,...,254 in the next 64-bit word;
    3,7,...,255 in the next 64-bit word.
*/
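/* Equivalently: within each vec256, coefficient i (0 <= i < 256) is stored
   at bit i/4 of 64-bit word i%4. */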
static inline void vec256_frombits(vec256 *v, const small *b) { | |||
int i; | |||
for (i = 0; i < numvec; ++i) { | |||
vec256 b0 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; /* 0,1,...,31 */ | |||
vec256 b1 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; /* 32,33,... */ | |||
vec256 b2 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 b3 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 b4 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 b5 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 b6 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 b7 = _mm256_loadu_si256((vec256 *) b); | |||
b += 32; | |||
vec256 c0 = _mm256_unpacklo_epi32(b0, b1); /* 0 1 2 3 32 33 34 35 4 5 6 7 36 37 38 39 ... 55 */ | |||
vec256 c1 = _mm256_unpackhi_epi32(b0, b1); /* 8 9 10 11 40 41 42 43 ... 63 */ | |||
vec256 c2 = _mm256_unpacklo_epi32(b2, b3); | |||
vec256 c3 = _mm256_unpackhi_epi32(b2, b3); | |||
vec256 c4 = _mm256_unpacklo_epi32(b4, b5); | |||
vec256 c5 = _mm256_unpackhi_epi32(b4, b5); | |||
vec256 c6 = _mm256_unpacklo_epi32(b6, b7); | |||
vec256 c7 = _mm256_unpackhi_epi32(b6, b7); | |||
vec256 d0 = c0 | _mm256_slli_epi32(c1, 2); /* 0 8, 1 9, 2 10, 3 11, 32 40, 33 41, ..., 55 63 */ | |||
vec256 d2 = c2 | _mm256_slli_epi32(c3, 2); | |||
vec256 d4 = c4 | _mm256_slli_epi32(c5, 2); | |||
vec256 d6 = c6 | _mm256_slli_epi32(c7, 2); | |||
vec256 e0 = _mm256_unpacklo_epi64(d0, d2); | |||
vec256 e2 = _mm256_unpackhi_epi64(d0, d2); | |||
vec256 e4 = _mm256_unpacklo_epi64(d4, d6); | |||
vec256 e6 = _mm256_unpackhi_epi64(d4, d6); | |||
vec256 f0 = e0 | _mm256_slli_epi32(e2, 1); | |||
vec256 f4 = e4 | _mm256_slli_epi32(e6, 1); | |||
vec256 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); | |||
vec256 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); | |||
vec256 h = g0 | _mm256_slli_epi32(g4, 4); | |||
#define TRANSPOSE _mm256_set_epi8( 31,27,23,19, 30,26,22,18, 29,25,21,17, 28,24,20,16, 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0 ) | |||
h = _mm256_shuffle_epi8(h, TRANSPOSE); | |||
h = _mm256_permute4x64_epi64(h, 0xd8); | |||
h = _mm256_shuffle_epi32(h, 0xd8); | |||
*v++ = h; | |||
} | |||
} | |||
static inline void vec256_tobits(const vec256 *v, small *b) { | |||
int i; | |||
for (i = 0; i < numvec; ++i) { | |||
vec256 h = *v++; | |||
h = _mm256_shuffle_epi32(h, 0xd8); | |||
h = _mm256_permute4x64_epi64(h, 0xd8); | |||
h = _mm256_shuffle_epi8(h, TRANSPOSE); | |||
vec256 g0 = h & _mm256_set1_epi8(15); | |||
vec256 g4 = _mm256_srli_epi32(h, 4) & _mm256_set1_epi8(15); | |||
vec256 f0 = _mm256_permute2x128_si256(g0, g4, 0x20); | |||
vec256 f4 = _mm256_permute2x128_si256(g0, g4, 0x31); | |||
vec256 e0 = f0 & _mm256_set1_epi8(5); | |||
vec256 e2 = _mm256_srli_epi32(f0, 1) & _mm256_set1_epi8(5); | |||
vec256 e4 = f4 & _mm256_set1_epi8(5); | |||
vec256 e6 = _mm256_srli_epi32(f4, 1) & _mm256_set1_epi8(5); | |||
vec256 d0 = _mm256_unpacklo_epi32(e0, e2); | |||
vec256 d2 = _mm256_unpackhi_epi32(e0, e2); | |||
vec256 d4 = _mm256_unpacklo_epi32(e4, e6); | |||
vec256 d6 = _mm256_unpackhi_epi32(e4, e6); | |||
vec256 c0 = d0 & _mm256_set1_epi8(1); | |||
vec256 c1 = _mm256_srli_epi32(d0, 2) & _mm256_set1_epi8(1); | |||
vec256 c2 = d2 & _mm256_set1_epi8(1); | |||
vec256 c3 = _mm256_srli_epi32(d2, 2) & _mm256_set1_epi8(1); | |||
vec256 c4 = d4 & _mm256_set1_epi8(1); | |||
vec256 c5 = _mm256_srli_epi32(d4, 2) & _mm256_set1_epi8(1); | |||
vec256 c6 = d6 & _mm256_set1_epi8(1); | |||
vec256 c7 = _mm256_srli_epi32(d6, 2) & _mm256_set1_epi8(1); | |||
vec256 b0 = _mm256_unpacklo_epi64(c0, c1); | |||
vec256 b1 = _mm256_unpackhi_epi64(c0, c1); | |||
vec256 b2 = _mm256_unpacklo_epi64(c2, c3); | |||
vec256 b3 = _mm256_unpackhi_epi64(c2, c3); | |||
vec256 b4 = _mm256_unpacklo_epi64(c4, c5); | |||
vec256 b5 = _mm256_unpackhi_epi64(c4, c5); | |||
vec256 b6 = _mm256_unpacklo_epi64(c6, c7); | |||
vec256 b7 = _mm256_unpackhi_epi64(c6, c7); | |||
_mm256_storeu_si256((vec256 *) b, b0); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b1); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b2); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b3); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b4); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b5); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b6); | |||
b += 32; | |||
_mm256_storeu_si256((vec256 *) b, b7); | |||
b += 32; | |||
} | |||
} | |||
static void vec256_init(vec256 *G0, vec256 *G1, const small *s) { | |||
int i; | |||
small srev[ppad + (ppad - p)]; | |||
small si; | |||
small g0[ppad]; | |||
small g1[ppad]; | |||
for (i = 0; i < p; ++i) { | |||
srev[ppad - 1 - i] = s[i]; | |||
} | |||
for (i = 0; i < ppad - p; ++i) { | |||
srev[i] = 0; | |||
} | |||
for (i = p; i < ppad; ++i) { | |||
srev[i + ppad - p] = 0; | |||
} | |||
for (i = 0; i < ppad; ++i) { | |||
si = srev[i + ppad - p]; | |||
g0[i] = si & 1; | |||
g1[i] = (si >> 1) & g0[i]; | |||
} | |||
vec256_frombits(G0, g0); | |||
vec256_frombits(G1, g1); | |||
} | |||
static void vec256_final(small *out, const vec256 *V0, const vec256 *V1) { | |||
int i; | |||
small v0[ppad]; | |||
small v1[ppad]; | |||
small v[ppad]; | |||
small vrev[ppad + (ppad - p)]; | |||
vec256_tobits(V0, v0); | |||
vec256_tobits(V1, v1); | |||
for (i = 0; i < ppad; ++i) { | |||
v[i] = v0[i] + 2 * v1[i] - 4 * (v0[i] & v1[i]); | |||
} | |||
for (i = 0; i < ppad; ++i) { | |||
vrev[i] = v[ppad - 1 - i]; | |||
} | |||
for (i = ppad; i < ppad + (ppad - p); ++i) { | |||
vrev[i] = 0; | |||
} | |||
for (i = 0; i < p; ++i) { | |||
out[i] = vrev[i + ppad - p]; | |||
} | |||
} | |||
static inline int negative_mask(int x) { | |||
return x >> 31; | |||
} | |||
static inline void vec256_swap(vec256 *f, vec256 *g, int len, vec256 mask) { | |||
vec256 flip; | |||
int i; | |||
for (i = 0; i < len; ++i) { | |||
flip = mask & (f[i] ^ g[i]); | |||
f[i] ^= flip; | |||
g[i] ^= flip; | |||
} | |||
} | |||
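/* Bitsliced GF(3): each coefficient is a bit pair (x0,x1) with (0,0) = 0,
   (1,0) = 1 and (1,1) = -1. vec256_scale computes f <- c*f and
   vec256_eliminate computes g <- g - c*f, where the constant c is given as
   the all-ones/all-zeros masks c0 (c != 0) and c1 (c negative). */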
static inline void vec256_scale(vec256 *f0, vec256 *f1, const vec256 c0, const vec256 c1) { | |||
int i; | |||
for (i = 0; i < numvec; ++i) { | |||
vec256 f0i = f0[i]; | |||
vec256 f1i = f1[i]; | |||
f0i &= c0; | |||
f1i ^= c1; | |||
f1i &= f0i; | |||
f0[i] = f0i; | |||
f1[i] = f1i; | |||
} | |||
} | |||
static inline void vec256_eliminate(vec256 *f0, vec256 *f1, vec256 *g0, vec256 *g1, int len, const vec256 c0, const vec256 c1) { | |||
int i; | |||
for (i = 0; i < len; ++i) { | |||
vec256 f0i = f0[i]; | |||
vec256 f1i = f1[i]; | |||
vec256 g0i = g0[i]; | |||
vec256 g1i = g1[i]; | |||
vec256 t; | |||
f0i &= c0; | |||
f1i ^= c1; | |||
f1i &= f0i; | |||
t = g0i ^ f0i; | |||
g0[i] = t | (g1i ^ f1i); | |||
g1[i] = (g1i ^ f0i) & (f1i ^ t); | |||
} | |||
} | |||
static inline int vec256_bit0mask(vec256 *f) { | |||
return -(_mm_cvtsi128_si32(_mm256_castsi256_si128(f[0])) & 1); | |||
} | |||
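/* vec256_divx_k and vec256_timesx_k divide/multiply a k-vector polynomial
   by x. With coefficient i stored at bit i/4 of 64-bit word i%4, this is a
   one-bit shift through the chained low quadwords followed by a quadword
   rotation of each vec256. */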
static inline void vec256_divx_1(vec256 *f) { | |||
vec256 f0 = f[0]; | |||
unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); | |||
low0 = low0 >> 1; | |||
f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); | |||
f[0] = _mm256_permute4x64_epi64(f0, 0x39); | |||
} | |||
static inline void vec256_divx_2(vec256 *f) { | |||
vec256 f0 = f[0]; | |||
vec256 f1 = f[1]; | |||
unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); | |||
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); | |||
low0 = (low0 >> 1) | (low1 << 63); | |||
low1 = low1 >> 1; | |||
f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); | |||
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); | |||
f[0] = _mm256_permute4x64_epi64(f0, 0x39); | |||
f[1] = _mm256_permute4x64_epi64(f1, 0x39); | |||
} | |||
static inline void vec256_divx_3(vec256 *f) { | |||
vec256 f0 = f[0]; | |||
vec256 f1 = f[1]; | |||
vec256 f2 = f[2]; | |||
unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); | |||
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); | |||
unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2)); | |||
low0 = (low0 >> 1) | (low1 << 63); | |||
low1 = (low1 >> 1) | (low2 << 63); | |||
low2 = low2 >> 1; | |||
f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); | |||
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); | |||
f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3); | |||
f[0] = _mm256_permute4x64_epi64(f0, 0x39); | |||
f[1] = _mm256_permute4x64_epi64(f1, 0x39); | |||
f[2] = _mm256_permute4x64_epi64(f2, 0x39); | |||
} | |||
static inline void vec256_timesx_1(vec256 *f) { | |||
vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); | |||
unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); | |||
low0 = low0 << 1; | |||
f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); | |||
f[0] = f0; | |||
} | |||
static inline void vec256_timesx_2(vec256 *f) { | |||
vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); | |||
vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); | |||
unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); | |||
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); | |||
low1 = (low1 << 1) | (low0 >> 63); | |||
low0 = low0 << 1; | |||
f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); | |||
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); | |||
f[0] = f0; | |||
f[1] = f1; | |||
} | |||
static inline void vec256_timesx_3(vec256 *f) { | |||
vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); | |||
vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); | |||
vec256 f2 = _mm256_permute4x64_epi64(f[2], 0x93); | |||
/* extract the chained low quadwords, shift left by one bit, and reinsert */
unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1));
unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2));
low2 = (low2 << 1) | (low1 >> 63);
low1 = (low1 << 1) | (low0 >> 63);
low0 = low0 << 1;
f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3);
f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3);
f[0] = f0; | |||
f[1] = f1; | |||
f[2] = f2; | |||
} | |||
static int __poly_S3_inv(unsigned char *outbytes, const unsigned char *inbytes) { | |||
small *out = (void *) outbytes; | |||
small *in = (void *) inbytes; | |||
vec256 F0[numvec]; | |||
vec256 F1[numvec]; | |||
vec256 G0[numvec]; | |||
vec256 G1[numvec]; | |||
vec256 V0[numvec]; | |||
vec256 V1[numvec]; | |||
vec256 R0[numvec]; | |||
vec256 R1[numvec]; | |||
vec256 c0vec, c1vec; | |||
int loop; | |||
int c0, c1; | |||
int minusdelta = -1; | |||
int swapmask; | |||
vec256 swapvec; | |||
vec256_init(G0, G1, in); | |||
F0[0] = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1); | |||
F0[1] = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1); | |||
F0[2] = _mm256_set_epi32(511, -1, 511, -1, 511, -1, 1023, -1); | |||
F1[0] = _mm256_set1_epi32(0); | |||
F1[1] = _mm256_set1_epi32(0); | |||
F1[2] = _mm256_set1_epi32(0); | |||
V0[0] = _mm256_set1_epi32(0); | |||
V1[0] = _mm256_set1_epi32(0); | |||
V0[1] = _mm256_set1_epi32(0); | |||
V1[1] = _mm256_set1_epi32(0); | |||
V0[2] = _mm256_set1_epi32(0); | |||
V1[2] = _mm256_set1_epi32(0); | |||
R0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1); | |||
R1[0] = _mm256_set1_epi32(0); | |||
R0[1] = _mm256_set1_epi32(0); | |||
R1[1] = _mm256_set1_epi32(0); | |||
R0[2] = _mm256_set1_epi32(0); | |||
R1[2] = _mm256_set1_epi32(0); | |||
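// 256 + 256 + 327 + 256 + 256 = 1351 = 2*p - 1 constant-time divsteps.
// The loop is split so that each block only touches the vec256s still live
// in the shrinking f,g (lengths 3,3,3,2,1) and the growing v,r (1,2,3,3,3).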
for (loop = 256; loop > 0; --loop) { | |||
vec256_timesx_1(V0); | |||
vec256_timesx_1(V1); | |||
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); | |||
c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); | |||
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); | |||
c1 &= c0; | |||
minusdelta ^= swapmask & (minusdelta ^ -minusdelta); | |||
minusdelta -= 1; | |||
swapvec = _mm256_set1_epi32(swapmask); | |||
vec256_swap(F0, G0, 3, swapvec); | |||
vec256_swap(F1, G1, 3, swapvec); | |||
c0vec = _mm256_set1_epi32(c0); | |||
c1vec = _mm256_set1_epi32(c1); | |||
vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); | |||
vec256_divx_3(G0); | |||
vec256_divx_3(G1); | |||
vec256_swap(V0, R0, 1, swapvec); | |||
vec256_swap(V1, R1, 1, swapvec); | |||
vec256_eliminate(V0, V1, R0, R1, 1, c0vec, c1vec); | |||
} | |||
for (loop = 256; loop > 0; --loop) { | |||
vec256_timesx_2(V0); | |||
vec256_timesx_2(V1); | |||
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); | |||
c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); | |||
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); | |||
c1 &= c0; | |||
minusdelta ^= swapmask & (minusdelta ^ -minusdelta); | |||
minusdelta -= 1; | |||
swapvec = _mm256_set1_epi32(swapmask); | |||
vec256_swap(F0, G0, 3, swapvec); | |||
vec256_swap(F1, G1, 3, swapvec); | |||
c0vec = _mm256_set1_epi32(c0); | |||
c1vec = _mm256_set1_epi32(c1); | |||
vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); | |||
vec256_divx_3(G0); | |||
vec256_divx_3(G1); | |||
vec256_swap(V0, R0, 2, swapvec); | |||
vec256_swap(V1, R1, 2, swapvec); | |||
vec256_eliminate(V0, V1, R0, R1, 2, c0vec, c1vec); | |||
} | |||
for (loop = 327; loop > 0; --loop) { | |||
vec256_timesx_3(V0); | |||
vec256_timesx_3(V1); | |||
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); | |||
c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); | |||
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); | |||
c1 &= c0; | |||
minusdelta ^= swapmask & (minusdelta ^ -minusdelta); | |||
minusdelta -= 1; | |||
swapvec = _mm256_set1_epi32(swapmask); | |||
vec256_swap(F0, G0, 3, swapvec); | |||
vec256_swap(F1, G1, 3, swapvec); | |||
c0vec = _mm256_set1_epi32(c0); | |||
c1vec = _mm256_set1_epi32(c1); | |||
vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); | |||
vec256_divx_3(G0); | |||
vec256_divx_3(G1); | |||
vec256_swap(V0, R0, 3, swapvec); | |||
vec256_swap(V1, R1, 3, swapvec); | |||
vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); | |||
} | |||
for (loop = 256; loop > 0; --loop) { | |||
vec256_timesx_3(V0); | |||
vec256_timesx_3(V1); | |||
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); | |||
c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); | |||
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); | |||
c1 &= c0; | |||
minusdelta ^= swapmask & (minusdelta ^ -minusdelta); | |||
minusdelta -= 1; | |||
swapvec = _mm256_set1_epi32(swapmask); | |||
vec256_swap(F0, G0, 2, swapvec); | |||
vec256_swap(F1, G1, 2, swapvec); | |||
c0vec = _mm256_set1_epi32(c0); | |||
c1vec = _mm256_set1_epi32(c1); | |||
vec256_eliminate(F0, F1, G0, G1, 2, c0vec, c1vec); | |||
vec256_divx_2(G0); | |||
vec256_divx_2(G1); | |||
vec256_swap(V0, R0, 3, swapvec); | |||
vec256_swap(V1, R1, 3, swapvec); | |||
vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); | |||
} | |||
for (loop = 256; loop > 0; --loop) { | |||
vec256_timesx_3(V0); | |||
vec256_timesx_3(V1); | |||
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); | |||
c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); | |||
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); | |||
c1 &= c0; | |||
minusdelta ^= swapmask & (minusdelta ^ -minusdelta); | |||
minusdelta -= 1; | |||
swapvec = _mm256_set1_epi32(swapmask); | |||
vec256_swap(F0, G0, 1, swapvec); | |||
vec256_swap(F1, G1, 1, swapvec); | |||
c0vec = _mm256_set1_epi32(c0); | |||
c1vec = _mm256_set1_epi32(c1); | |||
vec256_eliminate(F0, F1, G0, G1, 1, c0vec, c1vec); | |||
vec256_divx_1(G0); | |||
vec256_divx_1(G1); | |||
vec256_swap(V0, R0, 3, swapvec); | |||
vec256_swap(V1, R1, 3, swapvec); | |||
vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); | |||
} | |||
c0vec = _mm256_set1_epi32(vec256_bit0mask(F0)); | |||
c1vec = _mm256_set1_epi32(vec256_bit0mask(F1)); | |||
vec256_scale(V0, V1, c0vec, c1vec); | |||
vec256_final(out, V0, V1); | |||
out[p] = negative_mask(minusdelta); | |||
return 0; | |||
} | |||
// This code is based on crypto_core/invhrss701/faster from SUPERCOP. The code was written as a case study | |||
// for the paper "Fast constant-time gcd computation and modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. | |||
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_inv(poly *r_out, const poly *a) { | |||
const unsigned char *in = (void *) a; | |||
unsigned char *out = (void *) r_out; | |||
small input[ppad]; | |||
small output[ppad]; | |||
int i; | |||
/* XXX: obviously input/output format should be packed into bytes */ | |||
for (i = 0; i < p; ++i) { | |||
small x = in[2 * i] & 3; /* 0 1 2 3 */
x += 1; /* 1 2 3 4 */
x &= (x - 3) >> 5; /* 1 2 0 0 */
input[i] = x - 1; /* 0 1 -1 -1, i.e. 0,1,2 mapped to 0,1,-1 */
} | |||
/* XXX: merge with vec256_init */ | |||
__poly_S3_inv((unsigned char *)output, (unsigned char *)input); | |||
for (i = 0; i < p; ++i) { | |||
out[2 * i] = (3 & output[i]) ^ ((3 & output[i]) >> 1); | |||
out[2 * i + 1] = 0; | |||
} | |||
} |
@@ -0,0 +1,46 @@ | |||
#include "sample.h" | |||
void PQCLEAN_NTRUHPS2048677_AVX2_sample_fg(poly *f, poly *g, const unsigned char uniformbytes[NTRU_SAMPLE_FG_BYTES]) { | |||
PQCLEAN_NTRUHPS2048677_AVX2_sample_iid(f, uniformbytes); | |||
PQCLEAN_NTRUHPS2048677_AVX2_sample_fixed_type(g, uniformbytes + NTRU_SAMPLE_IID_BYTES); | |||
} | |||
void PQCLEAN_NTRUHPS2048677_AVX2_sample_rm(poly *r, poly *m, const unsigned char uniformbytes[NTRU_SAMPLE_RM_BYTES]) { | |||
PQCLEAN_NTRUHPS2048677_AVX2_sample_iid(r, uniformbytes); | |||
PQCLEAN_NTRUHPS2048677_AVX2_sample_fixed_type(m, uniformbytes + NTRU_SAMPLE_IID_BYTES); | |||
} | |||
#include "crypto_sort_int32.h" | |||
void PQCLEAN_NTRUHPS2048677_AVX2_sample_fixed_type(poly *r, const unsigned char u[NTRU_SAMPLE_FT_BYTES]) { | |||
// Assumes NTRU_SAMPLE_FT_BYTES = ceil(30*(n-1)/8) | |||
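// Strategy: give every coefficient a uniform 30-bit sort key in bits 2..31,
// pin the ternary value in bits 0..1 (NTRU_WEIGHT/2 ones, NTRU_WEIGHT/2
// twos, zeros elsewhere), sort with a constant-time sort, and keep the low
// two bits: the random keys select an (almost) uniform fixed-weight vector.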
int32_t s[NTRU_N - 1]; | |||
int i; | |||
// Use 30 bits of u per word | |||
for (i = 0; i < (NTRU_N - 1) / 4; i++) { | |||
s[4 * i + 0] = (u[15 * i + 0] << 2) + (u[15 * i + 1] << 10) + (u[15 * i + 2] << 18) + ((uint32_t) u[15 * i + 3] << 26); | |||
s[4 * i + 1] = ((u[15 * i + 3] & 0xc0) >> 4) + (u[15 * i + 4] << 4) + (u[15 * i + 5] << 12) + (u[15 * i + 6] << 20) + ((uint32_t) u[15 * i + 7] << 28); | |||
s[4 * i + 2] = ((u[15 * i + 7] & 0xf0) >> 2) + (u[15 * i + 8] << 6) + (u[15 * i + 9] << 14) + (u[15 * i + 10] << 22) + ((uint32_t) u[15 * i + 11] << 30); | |||
s[4 * i + 3] = (u[15 * i + 11] & 0xfc) + (u[15 * i + 12] << 8) + (u[15 * i + 13] << 16) + ((uint32_t) u[15 * i + 14] << 24);
} | |||
for (i = 0; i < NTRU_WEIGHT / 2; i++) { | |||
s[i] |= 1; | |||
} | |||
for (i = NTRU_WEIGHT / 2; i < NTRU_WEIGHT; i++) { | |||
s[i] |= 2; | |||
} | |||
PQCLEAN_NTRUHPS2048677_AVX2_crypto_sort_int32(s, NTRU_N - 1); | |||
for (i = 0; i < NTRU_N - 1; i++) { | |||
r->coeffs[i] = ((uint16_t) (s[i] & 3)); | |||
} | |||
r->coeffs[NTRU_N - 1] = 0; | |||
} |
@@ -0,0 +1,15 @@ | |||
#ifndef SAMPLE_H | |||
#define SAMPLE_H | |||
#include "params.h" | |||
#include "poly.h" | |||
void PQCLEAN_NTRUHPS2048677_AVX2_sample_fg(poly *f, poly *g, const unsigned char uniformbytes[NTRU_SAMPLE_FG_BYTES]); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_sample_rm(poly *r, poly *m, const unsigned char uniformbytes[NTRU_SAMPLE_RM_BYTES]); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_sample_iid(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_IID_BYTES]); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_sample_fixed_type(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_FT_BYTES]); | |||
#endif |
@@ -0,0 +1,21 @@ | |||
#include <immintrin.h> | |||
#include "sample.h" | |||
extern void PQCLEAN_NTRUHPS2048677_AVX2_vec32_sample_iid(poly *r, const unsigned char uniformbytes[PAD32(NTRU_SAMPLE_IID_BYTES)]); | |||
void PQCLEAN_NTRUHPS2048677_AVX2_sample_iid(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_IID_BYTES]) { | |||
int i; | |||
union { /* align to 32 byte boundary for vmovdqa */ | |||
unsigned char b[PAD32(NTRU_SAMPLE_IID_BYTES)]; | |||
__m256i b_x32[PAD32(NTRU_SAMPLE_IID_BYTES) / 32]; | |||
} buffer; | |||
for (i = 0; i < NTRU_SAMPLE_IID_BYTES; i++) { | |||
buffer.b[i] = uniformbytes[i]; | |||
} | |||
for (i = NTRU_SAMPLE_IID_BYTES; i < PAD32(NTRU_SAMPLE_IID_BYTES); i++) { | |||
buffer.b[i] = 0; | |||
} | |||
PQCLEAN_NTRUHPS2048677_AVX2_vec32_sample_iid(r, buffer.b); | |||
} |
@@ -0,0 +1,134 @@ | |||
.data | |||
.p2align 5 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048677_AVX2_square_1_677 | |||
.global _PQCLEAN_NTRUHPS2048677_AVX2_square_1_677 | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_1_677: | |||
_PQCLEAN_NTRUHPS2048677_AVX2_square_1_677: | |||
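# Squaring is GF(2)-linear: (sum a_i x^i)^2 = sum a_i x^(2i), so one squaring
# mod x^677 - 1 is the bit permutation i -> 2i mod 677. The pdep/pext masks
# below scatter and gather the bits of each 64-bit word along that permutation.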
push %r15 | |||
push %r14 | |||
push %r13 | |||
push %r12 | |||
push %rbx | |||
push %rbp | |||
mov 0(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $-0x1, %r10 | |||
mov $0x5555555555555555, %rbp | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 0(%rdi) | |||
mov $0xffffffff00000000, %rbx | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 8(%rdi) | |||
mov 8(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $-0x1, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 16(%rdi) | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 24(%rdi) | |||
mov 16(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $-0x1, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 32(%rdi) | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 40(%rdi) | |||
mov 24(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $-0x1, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 48(%rdi) | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 56(%rdi) | |||
mov 32(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $-0x1, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 64(%rdi) | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 72(%rdi) | |||
mov 40(%rsi), %r11 | |||
mov $0x7fffffff80000, %r12 | |||
pext %r12, %r11, %r10 | |||
mov $0xaaaaaaaaaaaaaaaa, %r13 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0xfff8000000000000, %r14 | |||
pext %r14, %r11, %r10 | |||
mov $0x2aaaaaa, %r15 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov %r11, %r10 | |||
and $0x7ffff, %r10 | |||
mov $0x1555555555, %r9 | |||
pdep %r9, %r10, %r10 | |||
mov %r10, 80(%rdi) | |||
mov 48(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x7ffff, %r10 | |||
mov $0xaaaaaaaaa8000000, %r8 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov 56(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x7ffff, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov 64(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x7ffff, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov 72(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x7ffff, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov 80(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x7ffff, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov $0x1ffff80000, %rdx | |||
pext %rdx, %r11, %r10 | |||
mov $0xaaaaaaaaa, %rcx | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 80(%rdi) | |||
movq $0x0, 88(%rdi) | |||
pop %rbp | |||
pop %rbx | |||
pop %r12 | |||
pop %r13 | |||
pop %r14 | |||
pop %r15 | |||
ret |
@@ -0,0 +1,235 @@ | |||
.data | |||
.p2align 5 | |||
.text | |||
.global PQCLEAN_NTRUHPS2048677_AVX2_square_2_677 | |||
.global _PQCLEAN_NTRUHPS2048677_AVX2_square_2_677 | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_2_677: | |||
_PQCLEAN_NTRUHPS2048677_AVX2_square_2_677: | |||
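# Two chained squarings mod x^677 - 1: the bit permutation i -> 4i mod 677.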
push %r15 | |||
push %r14 | |||
push %r13 | |||
push %r12 | |||
push %rbx | |||
push %rbp | |||
mov 0(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0xffff, %r10 | |||
mov $0x1111111111111111, %rbp | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 0(%rdi) | |||
mov $0xffff0000, %rbx | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 8(%rdi) | |||
mov $0xffff00000000, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 16(%rdi) | |||
mov $0xffff000000000000, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 24(%rdi) | |||
mov 8(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0xffff, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 32(%rdi) | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 40(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 48(%rdi) | |||
pext %r13, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 56(%rdi) | |||
mov 16(%rsi), %r11 | |||
mov $0x3fffc0000000000, %r14 | |||
pext %r14, %r11, %r10 | |||
mov $0x8888888888888888, %r15 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0xfc00000000000000, %r9 | |||
pext %r9, %r11, %r10 | |||
mov $0x888888, %r8 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov %r11, %r10 | |||
and $0xffff, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 64(%rdi) | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 72(%rdi) | |||
mov $0x3ff00000000, %rdx | |||
pext %rdx, %r11, %r10 | |||
mov $0x1111111111, %rcx | |||
pdep %rcx, %r10, %r10 | |||
mov %r10, 80(%rdi) | |||
mov 24(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x3ff, %r10 | |||
mov $0x8888888888000000, %rax | |||
pdep %rax, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0x3fffc00, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x3fffc000000, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov 32(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x3ff, %r10 | |||
pdep %rax, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
pext %r13, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
pext %r14, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov 40(%rsi), %r11 | |||
mov $0x7fff80000, %rbx | |||
pext %rbx, %r11, %r10 | |||
mov $0x4444444444444444, %rbp | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x7fff800000000, %rdx | |||
pext %rdx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0xfff8000000000000, %rcx | |||
pext %rcx, %r11, %r10 | |||
mov $0x4444444444444, %rax | |||
pdep %rax, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov %r11, %r10 | |||
and $0x3ff, %r10 | |||
mov $0x8888888888000000, %r12 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov $0x7fc00, %r13 | |||
pext %r13, %r11, %r10 | |||
mov $0x888888888, %r14 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 80(%rdi) | |||
mov 48(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x7, %r10 | |||
mov $0x4440000000000000, %r15 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x7fff8, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %rdx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %rcx, %r11, %r10 | |||
pdep %rax, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov 56(%rsi), %r11 | |||
mov $0xf000000000000000, %r8 | |||
pext %r8, %r11, %r10 | |||
mov $0x2222, %r12 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov %r11, %r10 | |||
and $0x7, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
pext %rdx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov $0xff8000000000000, %r13 | |||
pext %r13, %r11, %r10 | |||
mov $0x444444444, %r14 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 80(%rdi) | |||
mov 64(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0xfff, %r10 | |||
mov $0x2222222222220000, %rcx | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0xffff000, %rax | |||
pext %rax, %r11, %r10 | |||
mov $0x2222222222222222, %r8 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0xffff0000000, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0xffff00000000000, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0xf000000000000000, %r9 | |||
pext %r9, %r11, %r10 | |||
mov $0x2222, %rbx | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov 72(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0xfff, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
pext %rax, %r11, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
pext %r12, %r11, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
pext %r15, %r11, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
pext %r9, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
mov 80(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0xfff, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
pext %rax, %r11, %r10 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov $0x1ff0000000, %rdx | |||
pext %rdx, %r11, %r10 | |||
mov $0x222222222, %rbp | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 80(%rdi) | |||
movq $0x0, 88(%rdi) | |||
pop %rbp | |||
pop %rbx | |||
pop %r12 | |||
pop %r13 | |||
pop %r14 | |||
pop %r15 | |||
ret |
@@ -0,0 +1,452 @@ | |||
.data | |||
.p2align 5 | |||
.text | |||
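# square_3_677: 3-fold squaring in GF(2)[x]/(x^677 - 1), i.e. the bit
# permutation i -> 8*i mod 677.  Same pext/pdep scheme as square_2_677,
# with the scatter masks spaced 8 bits apart.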
.global PQCLEAN_NTRUHPS2048677_AVX2_square_3_677 | |||
.global _PQCLEAN_NTRUHPS2048677_AVX2_square_3_677 | |||
PQCLEAN_NTRUHPS2048677_AVX2_square_3_677: | |||
_PQCLEAN_NTRUHPS2048677_AVX2_square_3_677: | |||
push %r15 | |||
push %r14 | |||
push %r13 | |||
push %r12 | |||
push %rbx | |||
push %rbp | |||
mov 0(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0xff, %r10 | |||
mov $0x101010101010101, %rbp | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 0(%rdi) | |||
mov $0xff00, %rbx | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 8(%rdi) | |||
mov $0xff0000, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 16(%rdi) | |||
mov $0xff000000, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 24(%rdi) | |||
mov $0xff00000000, %r14 | |||
pext %r14, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 32(%rdi) | |||
mov $0xff0000000000, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 40(%rdi) | |||
mov $0xff000000000000, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 48(%rdi) | |||
mov $0xff00000000000000, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 56(%rdi) | |||
mov 8(%rsi), %r11 | |||
mov $0x1fe00000, %rdx | |||
pext %rdx, %r11, %r10 | |||
mov $0x808080808080808, %rcx | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x1fe0000000, %rax | |||
pext %rax, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0x1fe000000000, %rbx | |||
pext %rbx, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x1fe00000000000, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0x1fe0000000000000, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0xe000000000000000, %r14 | |||
pext %r14, %r11, %r10 | |||
mov $0x80808, %r15 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov %r11, %r10 | |||
and $0xff, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 64(%rdi) | |||
mov $0xff00, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
mov %r10, 72(%rdi) | |||
mov $0x1f0000, %r8 | |||
pext %r8, %r11, %r10 | |||
mov $0x101010101, %rdx | |||
pdep %rdx, %r10, %r10 | |||
mov %r10, 80(%rdi) | |||
mov 16(%rsi), %r11 | |||
mov $0x3fc0000000000, %rax | |||
pext %rax, %r11, %r10 | |||
mov $0x4040404040404040, %rbx | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x3fc000000000000, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %rbx, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0xfc00000000000000, %r13 | |||
pext %r13, %r11, %r10 | |||
mov $0x404040404040, %rcx | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov %r11, %r10 | |||
and $0x1f, %r10 | |||
mov $0x808080808000000, %r14 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0x1fe0, %r15 | |||
pext %r15, %r11, %r10 | |||
mov $0x808080808080808, %r9 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x1fe000, %rbp | |||
pext %rbp, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov $0x1fe00000, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
mov $0x1fe0000000, %rdx | |||
pext %rdx, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov $0x3e000000000, %rax | |||
pext %rax, %r11, %r10 | |||
mov $0x808080808, %r12 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 80(%rdi) | |||
mov 24(%rsi), %r11 | |||
mov $0xc000000000000000, %rbx | |||
pext %rbx, %r11, %r10 | |||
mov $0x202, %r13 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov %r11, %r10 | |||
and $0x3, %r10 | |||
mov $0x4040000000000000, %rcx | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x3fc, %r14 | |||
pext %r14, %r11, %r10 | |||
mov $0x4040404040404040, %r15 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0x3fc00, %rbp | |||
pext %rbp, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0x3fc0000, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0x3fc000000, %rdx | |||
pext %rdx, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x3fc00000000, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov $0x3fc0000000000, %rax | |||
pext %rax, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
mov $0x3fc000000000000, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov $0x3c00000000000000, %rbx | |||
pext %rbx, %r11, %r10 | |||
mov $0x40404040, %r13 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 80(%rdi) | |||
mov 32(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x3f, %r10 | |||
mov $0x202020202020000, %rcx | |||
pdep %rcx, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x3fc0, %r14 | |||
pext %r14, %r11, %r10 | |||
mov $0x202020202020202, %rbp | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0x3fc000, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x3fc00000, %rdx | |||
pext %rdx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0x3fc0000000, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0x3fc000000000, %rax | |||
pext %rax, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0x3fc00000000000, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x3fc0000000000000, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov $0xc000000000000000, %rbx | |||
pext %rbx, %r11, %r10 | |||
mov $0x202, %r13 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
mov 40(%rsi), %r11 | |||
mov $0x7f80000, %rcx | |||
pext %rcx, %r11, %r10 | |||
mov $0x1010101010101010, %r14 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x7f8000000, %r8 | |||
pext %r8, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0x7f800000000, %rdx | |||
pext %rdx, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x7f80000000000, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0x7f8000000000000, %rax | |||
pext %rax, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0xf800000000000000, %r12 | |||
pext %r12, %r11, %r10 | |||
mov $0x1010101010, %r15 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov %r11, %r10 | |||
and $0x3f, %r10 | |||
mov $0x202020202020000, %rbp | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
mov $0x3fc0, %rbx | |||
pext %rbx, %r11, %r10 | |||
mov $0x202020202020202, %r13 | |||
pdep %r13, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov $0x7c000, %rcx | |||
pext %rcx, %r11, %r10 | |||
mov $0x202020202, %r8 | |||
pdep %r8, %r10, %r10 | |||
xor %r10, 80(%rdi) | |||
mov 48(%rsi), %r11 | |||
mov $0xff0000000000, %rdx | |||
pext %rdx, %r11, %r10 | |||
mov $0x8080808080808080, %r9 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0xff000000000000, %rax | |||
pext %rax, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0xff00000000000000, %r14 | |||
pext %r14, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov %r11, %r10 | |||
and $0x7, %r10 | |||
mov $0x1010100000000000, %r12 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0x7f8, %r15 | |||
pext %r15, %r11, %r10 | |||
mov $0x1010101010101010, %rbp | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x7f800, %rbx | |||
pext %rbx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov $0x7f80000, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
mov $0x7f8000000, %rcx | |||
pext %rcx, %r11, %r10 | |||
pdep %rbp, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov $0xf800000000, %r8 | |||
pext %r8, %r11, %r10 | |||
mov $0x1010101010, %rdx | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 80(%rdi) | |||
mov 56(%rsi), %r11 | |||
mov $0xf000000000000000, %rax | |||
pext %rax, %r11, %r10 | |||
mov $0x4040404, %r14 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov %r11, %r10 | |||
and $0xff, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0xff00, %r12 | |||
pext %r12, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0xff0000, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0xff000000, %rbx | |||
pext %rbx, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0xff00000000, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov $0xff0000000000, %rcx | |||
pext %rcx, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
mov $0xff000000000000, %rbp | |||
pext %rbp, %r11, %r10 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov $0xf00000000000000, %r8 | |||
pext %r8, %r11, %r10 | |||
mov $0x80808080, %rdx | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 80(%rdi) | |||
mov 64(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0xf, %r10 | |||
mov $0x404040400000000, %rax | |||
pdep %rax, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0xff0, %r14 | |||
pext %r14, %r11, %r10 | |||
mov $0x404040404040404, %r12 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0xff000, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0xff00000, %rbx | |||
pext %rbx, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0xff0000000, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0xff000000000, %rcx | |||
pext %rcx, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0xff00000000000, %rbp | |||
pext %rbp, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0xff0000000000000, %r9 | |||
pext %r9, %r11, %r10 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov $0xf000000000000000, %r8 | |||
pext %r8, %r11, %r10 | |||
mov $0x4040404, %rdx | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
mov 72(%rsi), %r11 | |||
mov $0x1fe0000, %rax | |||
pext %rax, %r11, %r10 | |||
mov $0x2020202020202020, %r14 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 0(%rdi) | |||
mov $0x1fe000000, %r15 | |||
pext %r15, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 8(%rdi) | |||
mov $0x1fe00000000, %rbx | |||
pext %rbx, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 16(%rdi) | |||
mov $0x1fe0000000000, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 24(%rdi) | |||
mov $0x1fe000000000000, %rcx | |||
pext %rcx, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 32(%rdi) | |||
mov $0xfe00000000000000, %rbp | |||
pext %rbp, %r11, %r10 | |||
mov $0x20202020202020, %r9 | |||
pdep %r9, %r10, %r10 | |||
xor %r10, 40(%rdi) | |||
mov %r11, %r10 | |||
and $0xf, %r10 | |||
mov $0x404040400000000, %r12 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
mov $0xff0, %r8 | |||
pext %r8, %r11, %r10 | |||
mov $0x404040404040404, %rdx | |||
pdep %rdx, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov $0x1f000, %rax | |||
pext %rax, %r11, %r10 | |||
mov $0x404040404, %r15 | |||
pdep %r15, %r10, %r10 | |||
xor %r10, 80(%rdi) | |||
mov 80(%rsi), %r11 | |||
mov %r11, %r10 | |||
and $0x1, %r10 | |||
rol $61, %r10 | |||
xor %r10, 40(%rdi) | |||
mov $0x1fe, %rbx | |||
pext %rbx, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 48(%rdi) | |||
mov $0x1fe00, %r13 | |||
pext %r13, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 56(%rdi) | |||
mov $0x1fe0000, %rcx | |||
pext %rcx, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 64(%rdi) | |||
mov $0x1fe000000, %rbp | |||
pext %rbp, %r11, %r10 | |||
pdep %r14, %r10, %r10 | |||
xor %r10, 72(%rdi) | |||
mov $0x1e00000000, %r9 | |||
pext %r9, %r11, %r10 | |||
mov $0x20202020, %r12 | |||
pdep %r12, %r10, %r10 | |||
xor %r10, 80(%rdi) | |||
movq $0x0, 88(%rdi) | |||
pop %rbp | |||
pop %rbx | |||
pop %r12 | |||
pop %r13 | |||
pop %r14 | |||
pop %r15 | |||
ret |
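For reference, the map these generated pext/pdep stanzas compute can be written in a few lines of C: k-fold squaring of a binary polynomial modulo x^677 - 1 is the bit permutation i -> (2^k * i) mod 677, since squaring sends x^i to x^(2i) and 677 is prime, making the map a bijection. The sketch below is illustrative only and not part of the submitted sources; the name square_677_ref is hypothetical, and the word layout is an assumption matching the 11-qword packing used above.

/* Illustrative sketch, not part of the submitted sources: k-fold squaring
 * in GF(2)[x]/(x^677 - 1) as a plain bit permutation.  The generated
 * pext/pdep assembly above computes the same map, fully unrolled. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define N 677
#define WORDS 11  /* ceil(677/64); the asm also zeroes a 12th padding qword */

static void square_677_ref(uint64_t c[WORDS + 1], const uint64_t a[WORDS], unsigned k) {
    memset(c, 0, (WORDS + 1) * sizeof *c);
    for (size_t i = 0; i < N; i++) {
        uint64_t bit = (a[i / 64] >> (i % 64)) & 1;  /* coefficient of x^i */
        size_t j = ((size_t)i << k) % N;             /* x^i -> x^(2^k * i) mod x^N - 1 */
        c[j / 64] |= bit << (j % 64);
    }
}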
@@ -1,10 +1,10 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libntruhps2048677_clean.a | |||
HEADERS=api.h crypto_sort.h owcpa.h params.h poly.h sample.h verify.h | |||
OBJECTS=crypto_sort.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_mod.o poly_r2_inv.o poly_rq_mul.o poly_s3_inv.o sample.o sample_iid.o verify.o | |||
HEADERS=api.h cmov.h crypto_sort_int32.h owcpa.h params.h poly.h sample.h | |||
OBJECTS=cmov.o crypto_sort_int32.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_mod.o poly_r2_inv.o poly_rq_mul.o poly_s3_inv.o sample.o sample_iid.o | |||
CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) | |||
CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) | |||
all: $(LIB) | |||
@@ -2,7 +2,7 @@ | |||
# nmake /f Makefile.Microsoft_nmake | |||
LIBRARY=libntruhps2048677_clean.lib | |||
OBJECTS=crypto_sort.obj kem.obj owcpa.obj pack3.obj packq.obj poly.obj poly_lift.obj poly_mod.obj poly_r2_inv.obj poly_rq_mul.obj poly_s3_inv.obj sample.obj sample_iid.obj verify.obj | |||
OBJECTS=cmov.obj crypto_sort_int32.obj kem.obj owcpa.obj pack3.obj packq.obj poly.obj poly_lift.obj poly_mod.obj poly_r2_inv.obj poly_rq_mul.obj poly_s3_inv.obj sample.obj sample_iid.obj | |||
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX | |||