
Update NTRU and add AVX2 NTRU implementations

Tag: tags/v0.0.1
Author: John M. Schanck, 4 years ago; committed by Kris Kwiatkowski
Commit: f37f0f3e85
100 changed files with 91978 additions and 343 deletions
  1. +15 -2 crypto_kem/ntruhps2048509/META.yml
  2. +1 -0 crypto_kem/ntruhps2048509/avx2/LICENSE
  3. +24 -0 crypto_kem/ntruhps2048509/avx2/Makefile
  4. +19 -0 crypto_kem/ntruhps2048509/avx2/api.h
  5. +11 -0 crypto_kem/ntruhps2048509/avx2/cmov.c
  6. +10 -0 crypto_kem/ntruhps2048509/avx2/cmov.h
  7. +1215 -0 crypto_kem/ntruhps2048509/avx2/crypto_sort_int32.c
  8. +11 -0 crypto_kem/ntruhps2048509/avx2/crypto_sort_int32.h
  9. +68 -0 crypto_kem/ntruhps2048509/avx2/kem.c
  10. +160 -0 crypto_kem/ntruhps2048509/avx2/owcpa.c
  11. +22 -0 crypto_kem/ntruhps2048509/avx2/owcpa.h
  12. +46 -0 crypto_kem/ntruhps2048509/avx2/pack3.c
  13. +93 -0 crypto_kem/ntruhps2048509/avx2/packq.c
  14. +37 -0 crypto_kem/ntruhps2048509/avx2/params.h
  15. +67 -0 crypto_kem/ntruhps2048509/avx2/poly.c
  16. +41 -0 crypto_kem/ntruhps2048509/avx2/poly.h
  17. +11 -0 crypto_kem/ntruhps2048509/avx2/poly_lift.c
  18. +676 -0 crypto_kem/ntruhps2048509/avx2/poly_mod_3_Phi_n.s
  19. +80 -0 crypto_kem/ntruhps2048509/avx2/poly_mod_q_Phi_n.s
  20. +80 -0 crypto_kem/ntruhps2048509/avx2/poly_r2_inv.c
  21. +20 -0 crypto_kem/ntruhps2048509/avx2/poly_r2_inv.h
  22. +285 -0 crypto_kem/ntruhps2048509/avx2/poly_r2_mul.s
  23. +5625 -0 crypto_kem/ntruhps2048509/avx2/poly_rq_mul.s
  24. +955 -0 crypto_kem/ntruhps2048509/avx2/poly_rq_to_s3.s
  25. +463 -0 crypto_kem/ntruhps2048509/avx2/poly_s3_inv.c
  26. +46 -0 crypto_kem/ntruhps2048509/avx2/sample.c
  27. +15 -0 crypto_kem/ntruhps2048509/avx2/sample.h
  28. +21 -0 crypto_kem/ntruhps2048509/avx2/sample_iid.c
  29. +2854 -0 crypto_kem/ntruhps2048509/avx2/square_126_509_shufbytes.s
  30. +5125 -0 crypto_kem/ntruhps2048509/avx2/square_15_509_shufbytes.s
  31. +109 -0 crypto_kem/ntruhps2048509/avx2/square_1_509_patience.s
  32. +3992 -0 crypto_kem/ntruhps2048509/avx2/square_252_509_shufbytes.s
  33. +4316 -0 crypto_kem/ntruhps2048509/avx2/square_30_509_shufbytes.s
  34. +272 -0 crypto_kem/ntruhps2048509/avx2/square_3_509_patience.s
  35. +4186 -0 crypto_kem/ntruhps2048509/avx2/square_63_509_shufbytes.s
  36. +296 -0 crypto_kem/ntruhps2048509/avx2/square_6_509_patience.s
  37. +784 -0 crypto_kem/ntruhps2048509/avx2/vec32_sample_iid.s
  38. +2 -2 crypto_kem/ntruhps2048509/clean/Makefile
  39. +1 -1 crypto_kem/ntruhps2048509/clean/Makefile.Microsoft_nmake
  40. +1 -1 crypto_kem/ntruhps2048509/clean/api.h
  41. +11 -0 crypto_kem/ntruhps2048509/clean/cmov.c
  42. +10 -0 crypto_kem/ntruhps2048509/clean/cmov.h
  43. +0 -50 crypto_kem/ntruhps2048509/clean/crypto_sort.c
  44. +0 -6 crypto_kem/ntruhps2048509/clean/crypto_sort.h
  45. +86 -0 crypto_kem/ntruhps2048509/clean/crypto_sort_int32.c
  46. +11 -0 crypto_kem/ntruhps2048509/clean/crypto_sort_int32.h
  47. +2 -4 crypto_kem/ntruhps2048509/clean/kem.c
  48. +1 -0 crypto_kem/ntruhps2048509/clean/owcpa.c
  49. +0 -1 crypto_kem/ntruhps2048509/clean/pack3.c
  50. +6 -8 crypto_kem/ntruhps2048509/clean/packq.c
  51. +1 -0 crypto_kem/ntruhps2048509/clean/params.h
  52. +2 -6 crypto_kem/ntruhps2048509/clean/poly.h
  53. +1 -0 crypto_kem/ntruhps2048509/clean/poly_lift.c
  54. +51 -95 crypto_kem/ntruhps2048509/clean/poly_r2_inv.c
  55. +53 -112 crypto_kem/ntruhps2048509/clean/poly_s3_inv.c
  56. +9 -6 crypto_kem/ntruhps2048509/clean/sample.c
  57. +1 -2 crypto_kem/ntruhps2048509/clean/sample.h
  58. +0 -29 crypto_kem/ntruhps2048509/clean/verify.c
  59. +0 -12 crypto_kem/ntruhps2048509/clean/verify.h
  60. +15 -2 crypto_kem/ntruhps2048677/META.yml
  61. +1 -0 crypto_kem/ntruhps2048677/avx2/LICENSE
  62. +24 -0 crypto_kem/ntruhps2048677/avx2/Makefile
  63. +19 -0 crypto_kem/ntruhps2048677/avx2/api.h
  64. +11 -0 crypto_kem/ntruhps2048677/avx2/cmov.c
  65. +10 -0 crypto_kem/ntruhps2048677/avx2/cmov.h
  66. +1215 -0 crypto_kem/ntruhps2048677/avx2/crypto_sort_int32.c
  67. +11 -0 crypto_kem/ntruhps2048677/avx2/crypto_sort_int32.h
  68. +68 -0 crypto_kem/ntruhps2048677/avx2/kem.c
  69. +160 -0 crypto_kem/ntruhps2048677/avx2/owcpa.c
  70. +22 -0 crypto_kem/ntruhps2048677/avx2/owcpa.h
  71. +46 -0 crypto_kem/ntruhps2048677/avx2/pack3.c
  72. +93 -0 crypto_kem/ntruhps2048677/avx2/packq.c
  73. +37 -0 crypto_kem/ntruhps2048677/avx2/params.h
  74. +67 -0 crypto_kem/ntruhps2048677/avx2/poly.c
  75. +41 -0 crypto_kem/ntruhps2048677/avx2/poly.h
  76. +11 -0 crypto_kem/ntruhps2048677/avx2/poly_lift.c
  77. +928 -0 crypto_kem/ntruhps2048677/avx2/poly_mod_3_Phi_n.s
  78. +104 -0 crypto_kem/ntruhps2048677/avx2/poly_mod_q_Phi_n.s
  79. +73 -0 crypto_kem/ntruhps2048677/avx2/poly_r2_inv.c
  80. +23 -0 crypto_kem/ntruhps2048677/avx2/poly_r2_inv.h
  81. +466 -0 crypto_kem/ntruhps2048677/avx2/poly_r2_mul.s
  82. +8010 -0 crypto_kem/ntruhps2048677/avx2/poly_rq_mul.s
  83. +1255 -0 crypto_kem/ntruhps2048677/avx2/poly_rq_to_s3.s
  84. +569 -0 crypto_kem/ntruhps2048677/avx2/poly_s3_inv.c
  85. +46 -0 crypto_kem/ntruhps2048677/avx2/sample.c
  86. +15 -0 crypto_kem/ntruhps2048677/avx2/sample.h
  87. +21 -0 crypto_kem/ntruhps2048677/avx2/sample_iid.c
  88. +7189 -0 crypto_kem/ntruhps2048677/avx2/square_10_677_shufbytes.s
  89. +7312 -0 crypto_kem/ntruhps2048677/avx2/square_168_677_shufbytes.s
  90. +134 -0 crypto_kem/ntruhps2048677/avx2/square_1_677_patience.s
  91. +6580 -0 crypto_kem/ntruhps2048677/avx2/square_21_677_shufbytes.s
  92. +235 -0 crypto_kem/ntruhps2048677/avx2/square_2_677_patience.s
  93. +6450 -0 crypto_kem/ntruhps2048677/avx2/square_336_677_shufbytes.s
  94. +452 -0 crypto_kem/ntruhps2048677/avx2/square_3_677_patience.s
  95. +8477 -0 crypto_kem/ntruhps2048677/avx2/square_42_677_shufbytes.s
  96. +1478 -0 crypto_kem/ntruhps2048677/avx2/square_5_677_patience.s
  97. +6940 -0 crypto_kem/ntruhps2048677/avx2/square_84_677_shufbytes.s
  98. +1066 -0 crypto_kem/ntruhps2048677/avx2/vec32_sample_iid.s
  99. +3 -3 crypto_kem/ntruhps2048677/clean/Makefile
  100. +1 -1 crypto_kem/ntruhps2048677/clean/Makefile.Microsoft_nmake

crypto_kem/ntruhps2048509/META.yml (+15, -2)

@@ -1,4 +1,4 @@
name: NTRU-HPS2048509
name: ntruhps2048509
type: kem
claimed-nist-level: 1
claimed-security: IND-CCA2
@@ -15,9 +15,22 @@ auxiliary-submitters:
- Jeffrey Hoffstein
- Andreas Hülsing
- Joost Rijneveld
- Tsunekazu Saito
- Peter Schwabe
- William Whyte
- Keita Xagawa
- Takashi Yamakawa
- Zhenfei Zhang
implementations:
- name: clean
version: https://github.com/jschanck/ntru/tree/485dde03 reference implementation
version: https://github.com/jschanck/ntru/tree/4699d70a reference implementation
- name: avx2
version: https://github.com/jschanck/ntru/tree/4699d70a avx2 implementation
supported_platforms:
- architecture: x86_64
operating_systems:
- Linux
- Darwin
required_flags:
- avx2
- bmi2

crypto_kem/ntruhps2048509/avx2/LICENSE (+1, -0)

@@ -0,0 +1 @@
Public Domain

crypto_kem/ntruhps2048509/avx2/Makefile (+24, -0)

@@ -0,0 +1,24 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libntruhps2048509_avx2.a
HEADERS=api.h cmov.h crypto_sort_int32.h owcpa.h params.h poly.h poly_r2_inv.h sample.h
OBJECTS=cmov.o crypto_sort_int32.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_r2_inv.o poly_s3_inv.o sample.o sample_iid.o \
square_1_509_patience.o square_3_509_patience.o square_6_509_patience.o square_15_509_shufbytes.o square_30_509_shufbytes.o square_63_509_shufbytes.o square_126_509_shufbytes.o square_252_509_shufbytes.o \
poly_mod_3_Phi_n.o poly_mod_q_Phi_n.o poly_r2_mul.o poly_rq_mul.o poly_rq_to_s3.o vec32_sample_iid.o

CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)

all: $(LIB)

%.o: %.s $(HEADERS)
$(AS) -o $@ $<

%.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS)
$(AR) -r $@ $(OBJECTS)

clean:
$(RM) $(OBJECTS)
$(RM) $(LIB)

crypto_kem/ntruhps2048509/avx2/api.h (+19, -0)

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_NTRUHPS2048509_AVX2_API_H
#define PQCLEAN_NTRUHPS2048509_AVX2_API_H

#include <stdint.h>

#define PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_SECRETKEYBYTES 935
#define PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_PUBLICKEYBYTES 699
#define PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_CIPHERTEXTBYTES 699
#define PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_BYTES 32

#define PQCLEAN_NTRUHPS2048509_AVX2_CRYPTO_ALGNAME "ntruhps2048509"

int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk);

int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk);

int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk);

#endif

crypto_kem/ntruhps2048509/avx2/cmov.c (+11, -0)

@@ -0,0 +1,11 @@
#include "cmov.h"

/* b = 1 means mov, b = 0 means don't mov*/
void PQCLEAN_NTRUHPS2048509_AVX2_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
size_t i;

b = (~b + 1);
for (i = 0; i < len; i++) {
r[i] ^= b & (x[i] ^ r[i]);
}
}
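
The b = (~b + 1) line maps b = 1 to the all-ones byte 0xff and leaves b = 0 as 0x00, so the loop either copies x into r or leaves r untouched without a secret-dependent branch. A minimal standalone illustration of the same idiom (hypothetical test code, not part of the commit):

#include <stddef.h>
#include <stdio.h>

/* Same idiom as the cmov above: copy src over dst iff b == 1, branch-free. */
static void cmov_demo(unsigned char *dst, const unsigned char *src, size_t len, unsigned char b) {
    unsigned char mask = (unsigned char) (~b + 1);   /* 0x00 if b == 0, 0xff if b == 1 */
    for (size_t i = 0; i < len; i++) {
        dst[i] ^= mask & (src[i] ^ dst[i]);          /* dst[i] stays, or becomes src[i] */
    }
}

int main(void) {
    unsigned char r[4] = {1, 2, 3, 4}, x[4] = {9, 9, 9, 9};
    cmov_demo(r, x, sizeof r, 0);                    /* r unchanged: 1 2 3 4 */
    cmov_demo(r, x, sizeof r, 1);                    /* r becomes:   9 9 9 9 */
    printf("%u %u %u %u\n", r[0], r[1], r[2], r[3]);
    return 0;
}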

crypto_kem/ntruhps2048509/avx2/cmov.h (+10, -0)

@@ -0,0 +1,10 @@
#ifndef VERIFY_H
#define VERIFY_H

#include "params.h"

#include <stddef.h>

void PQCLEAN_NTRUHPS2048509_AVX2_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b);

#endif

crypto_kem/ntruhps2048509/avx2/crypto_sort_int32.c (+1215, -0; diff suppressed because the file is too large)


crypto_kem/ntruhps2048509/avx2/crypto_sort_int32.h (+11, -0)

@@ -0,0 +1,11 @@
#ifndef CRYPTO_SORT
#define CRYPTO_SORT

#include "params.h"

#include <stddef.h>
#include <stdint.h>

void PQCLEAN_NTRUHPS2048509_AVX2_crypto_sort_int32(int32_t *x, size_t n);

#endif

crypto_kem/ntruhps2048509/avx2/kem.c (+68, -0)

@@ -0,0 +1,68 @@
#include "api.h"
#include "cmov.h"
#include "fips202.h"
#include "owcpa.h"
#include "params.h"
#include "randombytes.h"
#include "sample.h"

// API FUNCTIONS
int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
uint8_t seed[NTRU_SAMPLE_FG_BYTES];

randombytes(seed, NTRU_SAMPLE_FG_BYTES);
PQCLEAN_NTRUHPS2048509_AVX2_owcpa_keypair(pk, sk, seed);

randombytes(sk + NTRU_OWCPA_SECRETKEYBYTES, NTRU_PRFKEYBYTES);

return 0;
}

int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) {
poly r, m;
uint8_t rm[NTRU_OWCPA_MSGBYTES];
uint8_t rm_seed[NTRU_SAMPLE_RM_BYTES];

randombytes(rm_seed, NTRU_SAMPLE_RM_BYTES);

PQCLEAN_NTRUHPS2048509_AVX2_sample_rm(&r, &m, rm_seed);

PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(rm, &r);
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(rm + NTRU_PACK_TRINARY_BYTES, &m);
sha3_256(k, rm, NTRU_OWCPA_MSGBYTES);

PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(&r);
PQCLEAN_NTRUHPS2048509_AVX2_owcpa_enc(c, &r, &m, pk);

return 0;
}

int PQCLEAN_NTRUHPS2048509_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
int i, fail;
uint8_t rm[NTRU_OWCPA_MSGBYTES];
uint8_t buf[NTRU_PRFKEYBYTES + NTRU_CIPHERTEXTBYTES];

fail = 0;

/* Check that unused bits of last byte of ciphertext are zero */
fail |= c[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG))));

fail |= PQCLEAN_NTRUHPS2048509_AVX2_owcpa_dec(rm, c, sk);
/* If fail = 0 then c = Enc(h, rm). There is no need to re-encapsulate. */
/* See comment in PQCLEAN_NTRUHPS2048509_AVX2_owcpa_dec for details. */

sha3_256(k, rm, NTRU_OWCPA_MSGBYTES);

/* shake(secret PRF key || input ciphertext) */
for (i = 0; i < NTRU_PRFKEYBYTES; i++) {
buf[i] = sk[i + NTRU_OWCPA_SECRETKEYBYTES];
}
for (i = 0; i < NTRU_CIPHERTEXTBYTES; i++) {
buf[NTRU_PRFKEYBYTES + i] = c[i];
}
sha3_256(rm, buf, NTRU_PRFKEYBYTES + NTRU_CIPHERTEXTBYTES);

PQCLEAN_NTRUHPS2048509_AVX2_cmov(k, rm, NTRU_SHAREDKEYBYTES, (unsigned char) fail);

return 0;
}
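
For this parameter set the padding check in crypto_kem_dec works out to masking the top four bits of the last ciphertext byte: NTRU_LOGQ * NTRU_PACK_DEG = 11 * 508 = 5588 packed bits, and 5588 & 7 = 4, so only four bits of the final byte carry data. If that check or owcpa_dec fails, the cmov at the end silently replaces the SHA3-derived key with sha3_256(PRF key || ciphertext), i.e. implicit rejection. A small standalone check of the mask arithmetic (not part of the commit; constants taken from params.h later in this diff):

#include <stdio.h>

int main(void) {
    int logq = 11, pack_deg = 508;                   /* hps2048509 values from params.h */
    int used = (logq * pack_deg) & 7;                /* 5588 & 7 = 4 data bits in the last byte */
    unsigned mask = (0xffu << (8 - used)) & 0xffu;   /* 0xf0: the unused high bits that must be zero */
    printf("data bits in last byte: %d, mask: 0x%02x\n", used, mask);
    return 0;
}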

crypto_kem/ntruhps2048509/avx2/owcpa.c (+160, -0)

@@ -0,0 +1,160 @@
#include "owcpa.h"
#include "poly.h"
#include "sample.h"

static int owcpa_check_r(const poly *r) {
/* Check that r is in message space. */
/* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */
int i;
uint64_t t = 0;
uint16_t c;
for (i = 0; i < NTRU_N; i++) {
c = MODQ(r->coeffs[i] + 1);
t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */
t |= (c + 1) & 0x4; /* 0 if c is in {0,1,2} */
}
t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */
t = (~t + 1); // two's complement
t >>= 63;
return (int) t;
}

static int owcpa_check_m(const poly *m) {
/* Check that m is in message space. */
/* Note: Assumes that m has coefficients in {0,1,2}. */
int i;
uint64_t t = 0;
uint16_t p1 = 0;
uint16_t m1 = 0;
for (i = 0; i < NTRU_N; i++) {
p1 += m->coeffs[i] & 0x01;
m1 += (m->coeffs[i] & 0x02) >> 1;
}
/* Need p1 = m1 and p1 + m1 = NTRU_WEIGHT */
t |= p1 ^ m1;
t |= (p1 + m1) ^ NTRU_WEIGHT;
t = (~t + 1); // two's complement
t >>= 63;
return (int) t;
}

void PQCLEAN_NTRUHPS2048509_AVX2_owcpa_keypair(unsigned char *pk,
unsigned char *sk,
const unsigned char seed[NTRU_SAMPLE_FG_BYTES]) {
int i;

poly x1, x2, x3, x4, x5;

poly *f = &x1, *g = &x2, *invf_mod3 = &x3;
poly *gf = &x3, *invgf = &x4, *tmp = &x5;
poly *invh = &x3, *h = &x3;

PQCLEAN_NTRUHPS2048509_AVX2_sample_fg(f, g, seed);

PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_inv(invf_mod3, f);
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(sk, f);
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(sk + NTRU_PACK_TRINARY_BYTES, invf_mod3);

/* Lift coeffs of f and g from Z_p to Z_q */
PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(f);
PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(g);


/* g = 3*g */
for (i = 0; i < NTRU_N; i++) {
g->coeffs[i] = 3 * g->coeffs[i];
}

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(gf, g, f);

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_inv(invgf, gf);

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(tmp, invgf, f);
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_mul(invh, tmp, f);
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_tobytes(sk + 2 * NTRU_PACK_TRINARY_BYTES, invh);

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(tmp, invgf, g);
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(h, tmp, g);
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_tobytes(pk, h);
}


void PQCLEAN_NTRUHPS2048509_AVX2_owcpa_enc(unsigned char *c,
const poly *r,
const poly *m,
const unsigned char *pk) {
int i;
poly x1, x2;
poly *h = &x1, *liftm = &x1;
poly *ct = &x2;

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_frombytes(h, pk);

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(ct, r, h);

PQCLEAN_NTRUHPS2048509_AVX2_poly_lift(liftm, m);
for (i = 0; i < NTRU_N; i++) {
ct->coeffs[i] = ct->coeffs[i] + liftm->coeffs[i];
}

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_tobytes(c, ct);
}

int PQCLEAN_NTRUHPS2048509_AVX2_owcpa_dec(unsigned char *rm,
const unsigned char *ciphertext,
const unsigned char *secretkey) {
int i;
int fail;
poly x1, x2, x3, x4;

poly *c = &x1, *f = &x2, *cf = &x3;
poly *mf = &x2, *finv3 = &x3, *m = &x4;
poly *liftm = &x2, *invh = &x3, *r = &x4;
poly *b = &x1;

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_frombytes(c, ciphertext);
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_frombytes(f, secretkey);
PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(f);

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(cf, c, f);
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3(mf, cf);

PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_frombytes(finv3, secretkey + NTRU_PACK_TRINARY_BYTES);
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_mul(m, mf, finv3);
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(rm + NTRU_PACK_TRINARY_BYTES, m);

/* NOTE: For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)). */
/* We can avoid re-computing r*h + Lift(m) as long as we check that */
/* r (defined as b/h mod (q, Phi_n)) and m are in the message space. */
/* (m can take any value in S3 in NTRU_HRSS) */
fail = 0;
fail |= owcpa_check_m(m);

/* b = c - Lift(m) mod (q, x^n - 1) */
PQCLEAN_NTRUHPS2048509_AVX2_poly_lift(liftm, m);
for (i = 0; i < NTRU_N; i++) {
b->coeffs[i] = c->coeffs[i] - liftm->coeffs[i];
}

/* r = b / h mod (q, Phi_n) */
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_frombytes(invh, secretkey + 2 * NTRU_PACK_TRINARY_BYTES);
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_mul(r, b, invh);

/* NOTE: Our definition of r as b/h mod (q, Phi_n) follows Figure 4 of */
/* [Sch18] https://eprint.iacr.org/2018/1174/20181203:032458. */
/* This differs from Figure 10 of Saito--Xagawa--Yamakawa */
/* [SXY17] https://eprint.iacr.org/2017/1005/20180516:055500 */
/* where r gets a final reduction modulo p. */
/* We need this change to use Proposition 1 of [Sch18]. */

/* Proposition 1 of [Sch18] shows that re-encryption with (r,m) yields c. */
/* if and only if fail==0 after the following call to owcpa_check_r */
/* The procedure given in Fig. 8 of [Sch18] can be skipped because we have */
/* c(1) = 0 due to the use of poly_Rq_sum_zero_{to,from}bytes. */
fail |= owcpa_check_r(r);

PQCLEAN_NTRUHPS2048509_AVX2_poly_trinary_Zq_to_Z3(r);
PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(rm, r);

return fail;
}
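
Tracking the aliased temporaries in owcpa_keypair gives the usual NTRU-HPS key relations; the following bookkeeping is my reading of the code above, not text from the commit:

/* After sample_fg and the g <- 3*g loop:
 *   gf    = (3g)*f                                  in R/q
 *   invgf = ((3g)*f)^-1                             in R/q
 *   invh  = invgf*f*f       = f/(3g)   mod (q, Phi_n)     -> packed into sk
 *   h     = invgf*(3g)*(3g) = 3g/f     mod (q, x^n - 1)   -> packed into pk
 *
 * owcpa_check_r flags r unless every coefficient lies in {0, 1, q-1}
 * (a ternary value lifted to Z_q) and coefficient n-1 is zero;
 * owcpa_check_m flags m unless it has exactly NTRU_WEIGHT/2 ones and
 * NTRU_WEIGHT/2 twos.  Both run in constant time and only accumulate a
 * flag, which is what lets owcpa_dec skip full re-encryption. */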

crypto_kem/ntruhps2048509/avx2/owcpa.h (+22, -0)

@@ -0,0 +1,22 @@
#ifndef OWCPA_H
#define OWCPA_H

#include "params.h"
#include "poly.h"

void PQCLEAN_NTRUHPS2048509_AVX2_owcpa_samplemsg(unsigned char msg[NTRU_OWCPA_MSGBYTES],
const unsigned char seed[NTRU_SEEDBYTES]);

void PQCLEAN_NTRUHPS2048509_AVX2_owcpa_keypair(unsigned char *pk,
unsigned char *sk,
const unsigned char seed[NTRU_SEEDBYTES]);

void PQCLEAN_NTRUHPS2048509_AVX2_owcpa_enc(unsigned char *c,
const poly *r,
const poly *m,
const unsigned char *pk);

int PQCLEAN_NTRUHPS2048509_AVX2_owcpa_dec(unsigned char *rm,
const unsigned char *ciphertext,
const unsigned char *secretkey);
#endif

crypto_kem/ntruhps2048509/avx2/pack3.c (+46, -0)

@@ -0,0 +1,46 @@
#include "poly.h"

void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(unsigned char msg[NTRU_OWCPA_MSGBYTES], const poly *a) {
int i;
unsigned char c;
int j;

for (i = 0; i < NTRU_PACK_DEG / 5; i++) {
c = a->coeffs[5 * i + 4] & 255;
c = (3 * c + a->coeffs[5 * i + 3]) & 255;
c = (3 * c + a->coeffs[5 * i + 2]) & 255;
c = (3 * c + a->coeffs[5 * i + 1]) & 255;
c = (3 * c + a->coeffs[5 * i + 0]) & 255;
msg[i] = c;
}
i = NTRU_PACK_DEG / 5;
c = 0;
for (j = NTRU_PACK_DEG - (5 * i) - 1; j >= 0; j--) {
c = (3 * c + a->coeffs[5 * i + j]) & 255;
}
msg[i] = c;
}

void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_frombytes(poly *r, const unsigned char msg[NTRU_OWCPA_MSGBYTES]) {
int i;
unsigned char c;
int j;

for (i = 0; i < NTRU_PACK_DEG / 5; i++) {
c = msg[i];
r->coeffs[5 * i + 0] = c;
r->coeffs[5 * i + 1] = c * 171 >> 9; // this is division by 3
r->coeffs[5 * i + 2] = c * 57 >> 9; // division by 3^2
r->coeffs[5 * i + 3] = c * 19 >> 9; // division by 3^3
r->coeffs[5 * i + 4] = c * 203 >> 14; // etc.
}
i = NTRU_PACK_DEG / 5;
c = msg[i];
for (j = 0; (5 * i + j) < NTRU_PACK_DEG; j++) {
r->coeffs[5 * i + j] = c;
c = c * 171 >> 9;
}
r->coeffs[NTRU_N - 1] = 0;
PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n(r);
}
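
The magic constants in poly_S3_frombytes are fixed-point reciprocals: each packed byte holds five base-3 digits, so c < 3^5 = 243, and in that range the multiply-and-shift forms are exact integer division. A quick standalone check of that claim (not part of the commit):

#include <assert.h>
#include <stdio.h>

int main(void) {
    /* 171/2^9, 57/2^9, 19/2^9 and 203/2^14 approximate 1/3, 1/9, 1/27 and 1/81
     * closely enough to be exact for every 5-trit byte value c < 243. */
    for (unsigned c = 0; c < 243; c++) {
        assert((c * 171 >> 9) == c / 3);
        assert((c * 57 >> 9) == c / 9);
        assert((c * 19 >> 9) == c / 27);
        assert((c * 203 >> 14) == c / 81);
    }
    printf("reciprocal shifts are exact for all c < 243\n");
    return 0;
}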


crypto_kem/ntruhps2048509/avx2/packq.c (+93, -0)

@@ -0,0 +1,93 @@
#include "poly.h"

void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_tobytes(unsigned char *r, const poly *a) {
int i, j;
uint16_t t[8];

for (i = 0; i < NTRU_PACK_DEG / 8; i++) {
for (j = 0; j < 8; j++) {
t[j] = MODQ(a->coeffs[8 * i + j]);
}

r[11 * i + 0] = (unsigned char) ( t[0] & 0xff);
r[11 * i + 1] = (unsigned char) ((t[0] >> 8) | ((t[1] & 0x1f) << 3));
r[11 * i + 2] = (unsigned char) ((t[1] >> 5) | ((t[2] & 0x03) << 6));
r[11 * i + 3] = (unsigned char) ((t[2] >> 2) & 0xff);
r[11 * i + 4] = (unsigned char) ((t[2] >> 10) | ((t[3] & 0x7f) << 1));
r[11 * i + 5] = (unsigned char) ((t[3] >> 7) | ((t[4] & 0x0f) << 4));
r[11 * i + 6] = (unsigned char) ((t[4] >> 4) | ((t[5] & 0x01) << 7));
r[11 * i + 7] = (unsigned char) ((t[5] >> 1) & 0xff);
r[11 * i + 8] = (unsigned char) ((t[5] >> 9) | ((t[6] & 0x3f) << 2));
r[11 * i + 9] = (unsigned char) ((t[6] >> 6) | ((t[7] & 0x07) << 5));
r[11 * i + 10] = (unsigned char) ((t[7] >> 3));
}

for (j = 0; j < NTRU_PACK_DEG - 8 * i; j++) {
t[j] = MODQ(a->coeffs[8 * i + j]);
}
for (; j < 8; j++) {
t[j] = 0;
}

switch (NTRU_PACK_DEG & 0x07) {
// cases 0 and 6 are impossible since 2 generates (Z/n)* and
// p mod 8 in {1, 7} implies that 2 is a quadratic residue.
case 4:
r[11 * i + 0] = (unsigned char) (t[0] & 0xff);
r[11 * i + 1] = (unsigned char) (t[0] >> 8) | ((t[1] & 0x1f) << 3);
r[11 * i + 2] = (unsigned char) (t[1] >> 5) | ((t[2] & 0x03) << 6);
r[11 * i + 3] = (unsigned char) (t[2] >> 2) & 0xff;
r[11 * i + 4] = (unsigned char) (t[2] >> 10) | ((t[3] & 0x7f) << 1);
r[11 * i + 5] = (unsigned char) (t[3] >> 7) | ((t[4] & 0x0f) << 4);
break;
case 2:
r[11 * i + 0] = (unsigned char) (t[0] & 0xff);
r[11 * i + 1] = (unsigned char) (t[0] >> 8) | ((t[1] & 0x1f) << 3);
r[11 * i + 2] = (unsigned char) (t[1] >> 5) | ((t[2] & 0x03) << 6);
break;
}
}

void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_frombytes(poly *r, const unsigned char *a) {
int i;
for (i = 0; i < NTRU_PACK_DEG / 8; i++) {
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8);
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5);
r->coeffs[8 * i + 2] = (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10);
r->coeffs[8 * i + 3] = (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7);
r->coeffs[8 * i + 4] = (a[11 * i + 5] >> 4) | (((uint16_t)a[11 * i + 6] & 0x7f) << 4);
r->coeffs[8 * i + 5] = (a[11 * i + 6] >> 7) | (((uint16_t)a[11 * i + 7] & 0xff) << 1) | (((uint16_t)a[11 * i + 8] & 0x03) << 9);
r->coeffs[8 * i + 6] = (a[11 * i + 8] >> 2) | (((uint16_t)a[11 * i + 9] & 0x1f) << 6);
r->coeffs[8 * i + 7] = (a[11 * i + 9] >> 5) | (((uint16_t)a[11 * i + 10] & 0xff) << 3);
}
switch (NTRU_PACK_DEG & 0x07) {
// cases 0 and 6 are impossible since 2 generates (Z/n)* and
// p mod 8 in {1, 7} implies that 2 is a quadratic residue.
case 4:
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8);
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5);
r->coeffs[8 * i + 2] = (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10);
r->coeffs[8 * i + 3] = (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7);
break;
case 2:
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8);
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5);
break;
}
r->coeffs[NTRU_N - 1] = 0;
}

void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_tobytes(unsigned char *r, const poly *a) {
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_tobytes(r, a);
}

void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_frombytes(poly *r, const unsigned char *a) {
int i;
PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_frombytes(r, a);

/* Set r[n-1] so that the sum of coefficients is zero mod q */
r->coeffs[NTRU_N - 1] = 0;
for (i = 0; i < NTRU_PACK_DEG; i++) {
r->coeffs[NTRU_N - 1] -= r->coeffs[i];
}
}
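
poly_Sq_tobytes streams eight 11-bit coefficients, least-significant bit first, into exactly 8*11 = 88 bits = 11 bytes; the unrolled shift expressions above are that stream written out by hand. A generic sketch of the same layout plus a round-trip check (hypothetical helper names, not part of the commit):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Append eight 11-bit values to an LSB-first bit stream: same layout as the
 * unrolled r[11*i + j] expressions above. */
static void pack8_ref(uint8_t r[11], const uint16_t t[8]) {
    uint32_t acc = 0;
    int bits = 0, out = 0;
    for (int i = 0; i < 8; i++) {
        acc |= (uint32_t) (t[i] & 0x7ff) << bits;
        bits += 11;
        while (bits >= 8) {               /* flush completed bytes */
            r[out++] = (uint8_t) acc;
            acc >>= 8;
            bits -= 8;
        }
    }
}

/* Inverse of pack8_ref: read the stream back 11 bits at a time. */
static void unpack8_ref(uint16_t t[8], const uint8_t r[11]) {
    uint32_t acc = 0;
    int bits = 0, in = 0;
    for (int i = 0; i < 8; i++) {
        while (bits < 11) {
            acc |= (uint32_t) r[in++] << bits;
            bits += 8;
        }
        t[i] = (uint16_t) (acc & 0x7ff);
        acc >>= 11;
        bits -= 11;
    }
}

int main(void) {
    uint16_t in[8] = {0, 1, 2047, 1234, 7, 1024, 2046, 3}, out[8];
    uint8_t buf[11];
    pack8_ref(buf, in);
    unpack8_ref(out, buf);
    for (int i = 0; i < 8; i++) {
        assert(in[i] == out[i]);
    }
    /* first two bytes agree with the unrolled expressions above */
    assert(buf[0] == (in[0] & 0xff));
    assert(buf[1] == ((in[0] >> 8) | ((in[1] & 0x1f) << 3)));
    printf("8 x 11 bits <-> 11 bytes round-trip ok\n");
    return 0;
}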

crypto_kem/ntruhps2048509/avx2/params.h (+37, -0)

@@ -0,0 +1,37 @@
#ifndef PARAMS_H
#define PARAMS_H

#define NTRU_HPS
#define NTRU_N 509
#define NTRU_LOGQ 11


/* Do not modify below this line */

#define PAD32(X) ((((X) + 31)/32)*32)

#define NTRU_Q (1 << NTRU_LOGQ)
#define NTRU_WEIGHT (NTRU_Q/8 - 2)

#define NTRU_SEEDBYTES 32
#define NTRU_PRFKEYBYTES 32
#define NTRU_SHAREDKEYBYTES 32

#define NTRU_SAMPLE_IID_BYTES (NTRU_N-1)
#define NTRU_SAMPLE_FT_BYTES ((30*(NTRU_N-1)+7)/8)
#define NTRU_SAMPLE_FG_BYTES (NTRU_SAMPLE_IID_BYTES+NTRU_SAMPLE_FT_BYTES)
#define NTRU_SAMPLE_RM_BYTES (NTRU_SAMPLE_IID_BYTES+NTRU_SAMPLE_FT_BYTES)

#define NTRU_PACK_DEG (NTRU_N-1)
#define NTRU_PACK_TRINARY_BYTES ((NTRU_PACK_DEG+4)/5)

#define NTRU_OWCPA_MSGBYTES (2*NTRU_PACK_TRINARY_BYTES)
#define NTRU_OWCPA_PUBLICKEYBYTES ((NTRU_LOGQ*NTRU_PACK_DEG+7)/8)
#define NTRU_OWCPA_SECRETKEYBYTES (2*NTRU_PACK_TRINARY_BYTES + NTRU_OWCPA_PUBLICKEYBYTES)
#define NTRU_OWCPA_BYTES ((NTRU_LOGQ*NTRU_PACK_DEG+7)/8)

#define NTRU_PUBLICKEYBYTES (NTRU_OWCPA_PUBLICKEYBYTES)
#define NTRU_SECRETKEYBYTES (NTRU_OWCPA_SECRETKEYBYTES + NTRU_PRFKEYBYTES)
#define NTRU_CIPHERTEXTBYTES (NTRU_OWCPA_BYTES)

#endif
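
Plugging NTRU_N = 509 and NTRU_LOGQ = 11 into these macros reproduces the byte counts declared in api.h above (my arithmetic, not text from the commit):

/* NTRU_Q                    = 1 << 11          = 2048
 * NTRU_WEIGHT               = 2048/8 - 2       = 254
 * NTRU_PACK_DEG             = 509 - 1          = 508
 * NTRU_PACK_TRINARY_BYTES   = (508 + 4)/5      = 102
 * NTRU_OWCPA_MSGBYTES       = 2*102            = 204
 * NTRU_OWCPA_PUBLICKEYBYTES = (11*508 + 7)/8   = 699   (CRYPTO_PUBLICKEYBYTES, CRYPTO_CIPHERTEXTBYTES)
 * NTRU_OWCPA_SECRETKEYBYTES = 2*102 + 699      = 903
 * NTRU_SECRETKEYBYTES       = 903 + 32         = 935   (CRYPTO_SECRETKEYBYTES)
 * NTRU_SHAREDKEYBYTES       = 32                       (CRYPTO_BYTES) */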

crypto_kem/ntruhps2048509/avx2/poly.c (+67, -0)

@@ -0,0 +1,67 @@
#include "poly.h"

/* Map {0, 1, 2} -> {0,1,q-1} in place */
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(poly *r) {
int i;
for (i = 0; i < NTRU_N; i++) {
r->coeffs[i] = r->coeffs[i] | ((-(r->coeffs[i] >> 1)) & (NTRU_Q - 1));
}
}

/* Map {0, 1, q-1} -> {0,1,2} in place */
void PQCLEAN_NTRUHPS2048509_AVX2_poly_trinary_Zq_to_Z3(poly *r) {
int i;
for (i = 0; i < NTRU_N; i++) {
r->coeffs[i] = MODQ(r->coeffs[i]);
r->coeffs[i] = 3 & (r->coeffs[i] ^ (r->coeffs[i] >> (NTRU_LOGQ - 1)));
}
}

void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_mul(poly *r, const poly *a, const poly *b) {
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(r, a, b);
PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n(r);
}

void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_mul(poly *r, const poly *a, const poly *b) {
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(r, a, b);
PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n(r);
}

static void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_inv_to_Rq_inv(poly *r, const poly *ai, const poly *a) {

int i;
poly b, c;
poly s;

// for 0..4
// ai = ai * (2 - a*ai) mod q
for (i = 0; i < NTRU_N; i++) {
b.coeffs[i] = -(a->coeffs[i]);
}

for (i = 0; i < NTRU_N; i++) {
r->coeffs[i] = ai->coeffs[i];
}

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&c, r, &b);
c.coeffs[0] += 2; // c = 2 - a*ai
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&s, &c, r); // s = ai*c

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&c, &s, &b);
c.coeffs[0] += 2; // c = 2 - a*s
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(r, &c, &s); // r = s*c

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&c, r, &b);
c.coeffs[0] += 2; // c = 2 - a*r
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&s, &c, r); // s = r*c

PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(&c, &s, &b);
c.coeffs[0] += 2; // c = 2 - a*s
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(r, &c, &s); // r = s*c
}

void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_inv(poly *r, const poly *a) {
poly ai2;
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_inv(&ai2, a);
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_inv_to_Rq_inv(r, &ai2, a);
}
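
poly_Z3_to_Zq maps 0, 1, 2 to 0, 1, q-1 by OR-ing in the all-ones-below-q mask whenever bit 1 of the coefficient is set. The four multiply pairs in poly_R2_inv_to_Rq_inv are Newton (Hensel) lifting steps: if v = a^-1 mod 2^k then v*(2 - a*v) = a^-1 mod 2^(2k), so the precision doubles 1 -> 2 -> 4 -> 8 -> 16 bits, and 16 >= 11 = log2(q). The same step for a single odd integer, as an illustration only (not the polynomial code from this commit):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define Q 2048u                          /* q = 2^11, matching params.h */
#define MODQ(x) ((x) & (Q - 1u))

int main(void) {
    uint32_t a = 1237;                   /* any odd value is invertible mod 2^11 */
    uint32_t v = 1;                      /* a^-1 mod 2, because a is odd */
    for (int i = 0; i < 4; i++) {        /* 4 steps: 1 -> 2 -> 4 -> 8 -> 16 bits */
        v = MODQ(v * (2u + Q - MODQ(a * v)));   /* v <- v*(2 - a*v) mod q */
    }
    assert(MODQ(a * v) == 1u);
    printf("%u * %u = 1 (mod %u)\n", a, v, Q);
    return 0;
}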

crypto_kem/ntruhps2048509/avx2/poly.h (+41, -0)

@@ -0,0 +1,41 @@
#ifndef POLY_H
#define POLY_H

#include <immintrin.h>
#include <stdint.h>

#include "params.h"

#define MODQ(X) ((X) & (NTRU_Q-1))

typedef union { /* align to 32 byte boundary for vmovdqa */
uint16_t coeffs[PAD32(NTRU_N)];
__m256i coeffs_x16[PAD32(NTRU_N) / 16];
} poly;

void PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n(poly *r);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n(poly *r);

void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_tobytes(unsigned char *r, const poly *a);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_frombytes(poly *r, const unsigned char *a);

void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_tobytes(unsigned char *r, const poly *a);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_sum_zero_frombytes(poly *r, const unsigned char *a);

void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_tobytes(unsigned char msg[NTRU_PACK_TRINARY_BYTES], const poly *a);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_frombytes(poly *r, const unsigned char msg[NTRU_PACK_TRINARY_BYTES]);

void PQCLEAN_NTRUHPS2048509_AVX2_poly_Sq_mul(poly *r, const poly *a, const poly *b);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_mul(poly *r, const poly *a, const poly *b);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_mul(poly *r, const poly *a, const poly *b);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_lift(poly *r, const poly *a);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3(poly *r, const poly *a);

void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_inv(poly *r, const poly *a);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_inv(poly *r, const poly *a);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_inv(poly *r, const poly *a);

void PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(poly *r);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_trinary_Zq_to_Z3(poly *r);

#endif

crypto_kem/ntruhps2048509/avx2/poly_lift.c (+11, -0)

@@ -0,0 +1,11 @@
#include "poly.h"

void PQCLEAN_NTRUHPS2048509_AVX2_poly_lift(poly *r, const poly *a) {
int i;
for (i = 0; i < NTRU_N; i++) {
r->coeffs[i] = a->coeffs[i];
}
PQCLEAN_NTRUHPS2048509_AVX2_poly_Z3_to_Zq(r);
}



crypto_kem/ntruhps2048509/avx2/poly_mod_3_Phi_n.s (+676, -0)

@@ -0,0 +1,676 @@
.data
.p2align 5
mask_ff:
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
mask_f:
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
mask_3:
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.text
.global PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n
.global _PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n
PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n:
_PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_3_Phi_n:
vmovdqa 992(%rdi), %ymm0
vpermq $3, %ymm0, %ymm0
vpslld $17, %ymm0, %ymm0
vpsrld $16, %ymm0, %ymm1
vpor %ymm0, %ymm1, %ymm0
vbroadcastss %xmm0, %ymm0
vpaddw 0(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 0(%rdi)
vpaddw 32(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 32(%rdi)
vpaddw 64(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 64(%rdi)
vpaddw 96(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 96(%rdi)
vpaddw 128(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 128(%rdi)
vpaddw 160(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 160(%rdi)
vpaddw 192(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 192(%rdi)
vpaddw 224(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 224(%rdi)
vpaddw 256(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 256(%rdi)
vpaddw 288(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 288(%rdi)
vpaddw 320(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 320(%rdi)
vpaddw 352(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 352(%rdi)
vpaddw 384(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 384(%rdi)
vpaddw 416(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 416(%rdi)
vpaddw 448(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 448(%rdi)
vpaddw 480(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 480(%rdi)
vpaddw 512(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 512(%rdi)
vpaddw 544(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 544(%rdi)
vpaddw 576(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 576(%rdi)
vpaddw 608(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 608(%rdi)
vpaddw 640(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 640(%rdi)
vpaddw 672(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 672(%rdi)
vpaddw 704(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 704(%rdi)
vpaddw 736(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 736(%rdi)
vpaddw 768(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 768(%rdi)
vpaddw 800(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 800(%rdi)
vpaddw 832(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 832(%rdi)
vpaddw 864(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 864(%rdi)
vpaddw 896(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 896(%rdi)
vpaddw 928(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 928(%rdi)
vpaddw 960(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 960(%rdi)
vpaddw 992(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 992(%rdi)
movw $0, 1018(%rdi)
movw $0, 1020(%rdi)
movw $0, 1022(%rdi)
ret
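
Each repeated 32-byte block above is the vectorized form of a branch-free "mod 3" on sixteen 16-bit lanes: because 2^8, 2^4 and 2^2 are all congruent to 1 mod 3, folding a value onto its low byte, nibble and bit-pair preserves its residue, and a final conditional subtract maps the result into {0, 1, 2}. The preamble broadcasts a value derived from the last coefficient and adds it to every lane (the reduction from x^n - 1 down to Phi_n), and the trailing movw stores zero the three padding words 509..511. A scalar reconstruction of one lane (my reading of the assembly; assumes arithmetic right shift, as the reference code uses elsewhere):

#include <stdint.h>
#include <stdio.h>

/* Branch-free a mod 3 for any 16-bit a, mirroring the vpand/vpsrlw/vpaddw/
 * vpsubw/vpsraw chain applied to every ymm lane above. */
static uint16_t mod3(uint16_t a) {
    uint16_t r;
    int16_t t, c;

    r = (uint16_t) ((a & 0xff) + (a >> 8));    /* 2^8 == 1 (mod 3) */
    r = (uint16_t) ((r & 0xf) + (r >> 4));     /* 2^4 == 1 (mod 3) */
    r = (uint16_t) ((r & 0x3) + (r >> 2));     /* 2^2 == 1 (mod 3) */
    r = (uint16_t) ((r & 0x3) + (r >> 2));     /* now r is in 0..5 */

    t = (int16_t) (r - 3);                     /* vpsubw */
    c = (int16_t) (t >> 15);                   /* vpsraw: all ones iff r < 3 */
    return (uint16_t) ((c & r) ^ (~c & t));    /* r if r < 3, else r - 3 */
}

int main(void) {
    for (uint32_t a = 0; a < 65536; a++) {
        if (mod3((uint16_t) a) != a % 3) {
            printf("mismatch at %u\n", a);
            return 1;
        }
    }
    printf("mod3 agrees with %% 3 for all 16-bit inputs\n");
    return 0;
}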

crypto_kem/ntruhps2048509/avx2/poly_mod_q_Phi_n.s (+80, -0)

@@ -0,0 +1,80 @@
.data
.p2align 5
.text
.global PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n
.global _PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n
PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n:
_PQCLEAN_NTRUHPS2048509_AVX2_poly_mod_q_Phi_n:
vmovdqa 992(%rdi), %ymm0
vpermq $3, %ymm0, %ymm0
vpslld $16, %ymm0, %ymm0
vpsrld $16, %ymm0, %ymm1
vpor %ymm0, %ymm1, %ymm0
vbroadcastss %xmm0, %ymm0
vxorpd %ymm1, %ymm1, %ymm1
vpsubw %ymm0, %ymm1, %ymm0
vpaddw 0(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 0(%rdi)
vpaddw 32(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 32(%rdi)
vpaddw 64(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 64(%rdi)
vpaddw 96(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 96(%rdi)
vpaddw 128(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 128(%rdi)
vpaddw 160(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 160(%rdi)
vpaddw 192(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 192(%rdi)
vpaddw 224(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 224(%rdi)
vpaddw 256(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 256(%rdi)
vpaddw 288(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 288(%rdi)
vpaddw 320(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 320(%rdi)
vpaddw 352(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 352(%rdi)
vpaddw 384(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 384(%rdi)
vpaddw 416(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 416(%rdi)
vpaddw 448(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 448(%rdi)
vpaddw 480(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 480(%rdi)
vpaddw 512(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 512(%rdi)
vpaddw 544(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 544(%rdi)
vpaddw 576(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 576(%rdi)
vpaddw 608(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 608(%rdi)
vpaddw 640(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 640(%rdi)
vpaddw 672(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 672(%rdi)
vpaddw 704(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 704(%rdi)
vpaddw 736(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 736(%rdi)
vpaddw 768(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 768(%rdi)
vpaddw 800(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 800(%rdi)
vpaddw 832(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 832(%rdi)
vpaddw 864(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 864(%rdi)
vpaddw 896(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 896(%rdi)
vpaddw 928(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 928(%rdi)
vpaddw 960(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 960(%rdi)
vpaddw 992(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 992(%rdi)
ret
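
This routine is the mod-q counterpart of the previous one: it broadcasts the negation of the last coefficient and adds it to every 16-bit lane. Working mod x^n - 1, x^(n-1) = -(x^(n-2) + ... + x + 1) mod Phi_n, so reducing mod (q, Phi_n) amounts to subtracting coefficient n-1 from every coefficient, which also zeroes position n-1. A scalar sketch of the same sweep (my reading of the assembly; the assembly leaves the lanes unmasked and lets them wrap, with MODQ applied elsewhere):

#include <stdint.h>
#include <stdio.h>

#define NTRU_N 509
#define NTRU_Q 2048
#define MODQ(X) ((X) & (NTRU_Q - 1))

/* Scalar equivalent of the vpaddw sweep above. */
static void mod_q_phi_n(uint16_t c[NTRU_N]) {
    uint16_t last = c[NTRU_N - 1];
    for (int i = 0; i < NTRU_N; i++) {
        c[i] = (uint16_t) MODQ((uint16_t) (c[i] - last));   /* position n-1 becomes 0 */
    }
}

int main(void) {
    uint16_t c[NTRU_N] = {5, 7};          /* remaining coefficients start at 0 */
    c[NTRU_N - 1] = 3;
    mod_q_phi_n(c);
    printf("%u %u ... %u\n", c[0], c[1], c[NTRU_N - 1]);    /* 2 4 ... 0 */
    return 0;
}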

crypto_kem/ntruhps2048509/avx2/poly_r2_inv.c (+80, -0)

@@ -0,0 +1,80 @@
#include <immintrin.h>

#include "poly_r2_inv.h"
#include "poly.h"

// Using pdep/pext for these two functions is faster but not a lot since they work on uint64_t which means
// we can only do 4 coefficients at a time. Per byte (where we store 8 coefficients) we will thus need 2 pdeps/pexts
// and an additional shift. In the case of tobytes we also need a logical or.
// On AMD Ryzen pdep/pext are quite slow and the naive solution (looping through and setting each bit individually)
// is preferred.
void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_tobytes(unsigned char *out, const poly *a) {
// Since pext works on a uint64_t we view the coefficient pointer as a 64-bit pointer
// so that we can extract 4 coefficient at a time. It also makes arithmetic a little easier.
uint64_t *coeff_pointer = (void *) a->coeffs;

int i;
for (i = 0; i < 63; i++) {
out[i] = _pext_u64(coeff_pointer[2 * i], 0x1000100010001);
out[i] |= _pext_u64(coeff_pointer[2 * i + 1], 0x1000100010001) << 4;
}
out[i] = _pext_u64(coeff_pointer[2 * 63], 0x1000100010001);
out[i] |= _pext_u64(coeff_pointer[2 * 63 + 1], 0x1) << 4;
}

void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_frombytes(poly *a, const unsigned char *in) {
// Since pdep results in a uint64_t we view the coefficient pointer as a 64-bit pointer
// so that we can store 4 coefficient at a time. It also makes arithmetic a little easier.
uint64_t *coeff_pointer = (void *) a->coeffs;

int i;
for (i = 0; i < 63; i++) {
coeff_pointer[2 * i] = _pdep_u64(in[i], 0x1000100010001);
coeff_pointer[2 * i + 1] = _pdep_u64(in[i] >> 4, 0x1000100010001);
}
// From the last byte we only want 5 bits (since we have 509 total, not 512).
coeff_pointer[2 * 63] = _pdep_u64(in[i], 0x1000100010001);
coeff_pointer[2 * 63 + 1] = _pdep_u64(in[i] >> 4, 0x1);
}

void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_inv(poly *r, const poly *a) {
union {
unsigned char s[64];
__m256i s_x32[2];
} squares[13];
#define s(x) squares[(x)].s

// This relies on the following addition chain:
// 1, 2, 3, 6, 12, 15, 30, 60, 63, 126, 252, 504, 507

PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_tobytes(s(0), a); // TODO alignment

PQCLEAN_NTRUHPS2048509_AVX2_square_1_509(s(1), s(0));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(1), s(1), s(0));
PQCLEAN_NTRUHPS2048509_AVX2_square_1_509(s(2), s(1));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(2), s(2), s(0));
PQCLEAN_NTRUHPS2048509_AVX2_square_3_509(s(3), s(2));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(3), s(3), s(2));
PQCLEAN_NTRUHPS2048509_AVX2_square_6_509(s(4), s(3));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(4), s(4), s(3));
PQCLEAN_NTRUHPS2048509_AVX2_square_3_509(s(5), s(4));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(5), s(5), s(2));
PQCLEAN_NTRUHPS2048509_AVX2_square_15_509(s(6), s(5));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(6), s(6), s(5));
PQCLEAN_NTRUHPS2048509_AVX2_square_30_509(s(7), s(6));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(7), s(7), s(6));
PQCLEAN_NTRUHPS2048509_AVX2_square_3_509(s(8), s(7));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(8), s(8), s(2));
PQCLEAN_NTRUHPS2048509_AVX2_square_63_509(s(9), s(8));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(9), s(9), s(8));
PQCLEAN_NTRUHPS2048509_AVX2_square_126_509(s(10), s(9));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(10), s(10), s(9));
PQCLEAN_NTRUHPS2048509_AVX2_square_252_509(s(11), s(10));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(11), s(11), s(10));
PQCLEAN_NTRUHPS2048509_AVX2_square_3_509(s(12), s(11));
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(s(12), s(12), s(2));
PQCLEAN_NTRUHPS2048509_AVX2_square_1_509(s(0), s(12));

PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_frombytes(r, s(0));
#undef s
}
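
poly_R2_inv is an Itoh-Tsujii style inversion: as the packq.c comment notes, 2 generates (Z/n)*, so Phi_509 is irreducible over GF(2), S2 is a field with 2^508 elements, and a^-1 = a^(2^508 - 2). Each s(i) holds a power of the form a^(2^k - 1) for the k values listed in the addition-chain comment; squaring such a value m times and multiplying by a stored a^(2^m - 1) yields a^(2^(k+m) - 1), and the final square_1 call turns a^(2^507 - 1) into a^(2^508 - 2). A bookkeeping sketch that only tracks the exponent parameter k for the call sequence above (my reading of the code, no polynomial arithmetic):

#include <assert.h>
#include <stdio.h>

/* (2^k - 1) * 2^m + (2^m - 1) = 2^(k+m) - 1: square m times, multiply once. */
static int step(int k, int m) {
    return k + m;
}

int main(void) {
    int k = 1;          /* s(0) = a = a^(2^1 - 1) */
    k = step(k, 1);     /* square_1,   mul s(0):  k = 2   */
    k = step(k, 1);     /* square_1,   mul s(0):  k = 3   */
    k = step(k, 3);     /* square_3,   mul s(2):  k = 6   */
    k = step(k, 6);     /* square_6,   mul s(3):  k = 12  */
    k = step(k, 3);     /* square_3,   mul s(2):  k = 15  */
    k = step(k, 15);    /* square_15,  mul s(5):  k = 30  */
    k = step(k, 30);    /* square_30,  mul s(6):  k = 60  */
    k = step(k, 3);     /* square_3,   mul s(2):  k = 63  */
    k = step(k, 63);    /* square_63,  mul s(8):  k = 126 */
    k = step(k, 126);   /* square_126, mul s(9):  k = 252 */
    k = step(k, 252);   /* square_252, mul s(10): k = 504 */
    k = step(k, 3);     /* square_3,   mul s(2):  k = 507 */
    assert(k == 507);
    printf("one final squaring gives the exponent 2^%d - 2\n", k + 1);
    return 0;
}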

crypto_kem/ntruhps2048509/avx2/poly_r2_inv.h (+20, -0)

@@ -0,0 +1,20 @@
#ifndef POLY_R2_INV_H
#define POLY_R2_INV_H

#include "poly.h"

void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_tobytes(unsigned char *out, const poly *a);
void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_frombytes(poly *a, const unsigned char *in);

extern void PQCLEAN_NTRUHPS2048509_AVX2_square_1_509(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_3_509(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_6_509(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_15_509(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_30_509(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_63_509(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_126_509(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048509_AVX2_square_252_509(unsigned char *out, const unsigned char *a);

extern void PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul(unsigned char *out, const unsigned char *a,
const unsigned char *b);
#endif

crypto_kem/ntruhps2048509/avx2/poly_r2_mul.s (+285, -0)

@@ -0,0 +1,285 @@
.data
.p2align 5
mask1100:
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
mask0110:
.word 0
.word 0
.word 0
.word 0
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 0
.word 0
.word 0
.word 0
mask0011:
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
mask1000:
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 65535
.word 65535
.word 65535
.word 65535
mask0111:
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 0
.word 0
.word 0
.word 0
low253:
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 8191
.text
.global PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul
.global _PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul
PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul:
_PQCLEAN_NTRUHPS2048509_AVX2_poly_R2_mul:
vmovdqa 0(%rsi), %ymm0
vmovdqa 32(%rsi), %ymm1
vmovdqa 0(%rdx), %ymm3
vmovdqa 32(%rdx), %ymm4
vpxor %ymm0, %ymm1, %ymm6
vpxor %ymm3, %ymm4, %ymm7
vextracti128 $1, %ymm0, %xmm11
vextracti128 $1, %ymm3, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm5
vpclmulqdq $16, %xmm11, %xmm12, %xmm14
vpclmulqdq $17, %xmm11, %xmm12, %xmm15
vpxor %xmm5, %xmm14, %xmm14
vpclmulqdq $0, %xmm11, %xmm12, %xmm5
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm5, %ymm5
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm5, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm5
vpxor %xmm0, %xmm11, %xmm11
vpxor %xmm3, %xmm12, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm13
vpclmulqdq $16, %xmm11, %xmm12, %xmm14
vpclmulqdq $17, %xmm11, %xmm12, %xmm15
vpxor %xmm13, %xmm14, %xmm14
vpclmulqdq $0, %xmm11, %xmm12, %xmm13
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm13, %ymm13
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm13, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm13
vpclmulqdq $1, %xmm0, %xmm3, %xmm2
vpclmulqdq $16, %xmm0, %xmm3, %xmm14
vpclmulqdq $17, %xmm0, %xmm3, %xmm15
vpxor %xmm2, %xmm14, %xmm14
vpclmulqdq $0, %xmm0, %xmm3, %xmm2
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm2, %ymm2
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm2, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm2
vpxor %ymm13, %ymm5, %ymm13
vpxor %ymm13, %ymm2, %ymm13
vpxor %ymm11, %ymm11, %ymm11
vextracti128 $1, %ymm13, %xmm11
vpxor %ymm5, %ymm11, %ymm5
vpxor %ymm11, %ymm11, %ymm11
vinserti128 $1, %xmm13, %ymm11, %ymm11
vpxor %ymm11, %ymm2, %ymm2
vextracti128 $1, %ymm1, %xmm11
vextracti128 $1, %ymm4, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm9
vpclmulqdq $16, %xmm11, %xmm12, %xmm14
vpclmulqdq $17, %xmm11, %xmm12, %xmm15
vpxor %xmm9, %xmm14, %xmm14
vpclmulqdq $0, %xmm11, %xmm12, %xmm9
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm9, %ymm9
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm9
vpxor %xmm1, %xmm11, %xmm11
vpxor %xmm4, %xmm12, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm13
vpclmulqdq $16, %xmm11, %xmm12, %xmm14
vpclmulqdq $17, %xmm11, %xmm12, %xmm15
vpxor %xmm13, %xmm14, %xmm14
vpclmulqdq $0, %xmm11, %xmm12, %xmm13
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm13, %ymm13
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm13, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm13
vpclmulqdq $1, %xmm1, %xmm4, %xmm8
vpclmulqdq $16, %xmm1, %xmm4, %xmm14
vpclmulqdq $17, %xmm1, %xmm4, %xmm15
vpxor %xmm8, %xmm14, %xmm14
vpclmulqdq $0, %xmm1, %xmm4, %xmm8
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm8, %ymm8
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm8, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm8
vpxor %ymm13, %ymm9, %ymm13
vpxor %ymm13, %ymm8, %ymm13
vpxor %ymm11, %ymm11, %ymm11
vextracti128 $1, %ymm13, %xmm11
vpxor %ymm9, %ymm11, %ymm9
vpxor %ymm11, %ymm11, %ymm11
vinserti128 $1, %xmm13, %ymm11, %ymm11
vpxor %ymm11, %ymm8, %ymm8
vextracti128 $1, %ymm6, %xmm11
vextracti128 $1, %ymm7, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm1
vpclmulqdq $16, %xmm11, %xmm12, %xmm14
vpclmulqdq $17, %xmm11, %xmm12, %xmm15
vpxor %xmm1, %xmm14, %xmm14
vpclmulqdq $0, %xmm11, %xmm12, %xmm1
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm1, %ymm1
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm1, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm1
vpxor %xmm6, %xmm11, %xmm11
vpxor %xmm7, %xmm12, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm13
vpclmulqdq $16, %xmm11, %xmm12, %xmm14
vpclmulqdq $17, %xmm11, %xmm12, %xmm15
vpxor %xmm13, %xmm14, %xmm14
vpclmulqdq $0, %xmm11, %xmm12, %xmm13
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm13, %ymm13
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm13, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm13
vpclmulqdq $1, %xmm6, %xmm7, %xmm0
vpclmulqdq $16, %xmm6, %xmm7, %xmm14
vpclmulqdq $17, %xmm6, %xmm7, %xmm15
vpxor %xmm0, %xmm14, %xmm14
vpclmulqdq $0, %xmm6, %xmm7, %xmm0
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm0, %ymm0
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm0, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm0
vpxor %ymm13, %ymm1, %ymm13
vpxor %ymm13, %ymm0, %ymm13
vpxor %ymm11, %ymm11, %ymm11
vextracti128 $1, %ymm13, %xmm11
vpxor %ymm1, %ymm11, %ymm1
vpxor %ymm11, %ymm11, %ymm11
vinserti128 $1, %xmm13, %ymm11, %ymm11
vpxor %ymm11, %ymm0, %ymm0
vpxor %ymm0, %ymm2, %ymm0
vpxor %ymm0, %ymm8, %ymm0
vpxor %ymm1, %ymm5, %ymm1
vpxor %ymm1, %ymm9, %ymm1
vpxor %ymm0, %ymm5, %ymm5
vpxor %ymm1, %ymm8, %ymm8
vpand mask1000(%rip), %ymm5, %ymm13
vpand mask0111(%rip), %ymm8, %ymm12
vpxor %ymm12, %ymm13, %ymm12
vpsrlq $61, %ymm12, %ymm12
vpermq $147, %ymm12, %ymm12
vpxor %ymm12, %ymm2, %ymm2
vpsllq $3, %ymm8, %ymm12
vpxor %ymm12, %ymm2, %ymm2
vpand mask1000(%rip), %ymm8, %ymm13
vpand mask0111(%rip), %ymm9, %ymm12
vpxor %ymm12, %ymm13, %ymm12
vpsrlq $61, %ymm12, %ymm12
vpermq $147, %ymm12, %ymm12
vpxor %ymm12, %ymm5, %ymm5
vpsllq $3, %ymm9, %ymm12
vpxor %ymm12, %ymm5, %ymm5
vpand low253(%rip), %ymm5, %ymm5
vmovdqa %ymm2, 0(%rdi)
vmovdqa %ymm5, 32(%rdi)
ret
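
Note: each repeated block above (four vpclmulqdq with selectors $0/$1/$16/$17 followed by the mask0011/mask0110/mask1100 recombination) is a 128x128-bit carry-less multiply; the routine appears to apply one Karatsuba layer over the two 256-bit halves, and the tail (vpsllq $3 / vpsrlq $61 / low253) appears to fold the wide product back mod x^509 - 1. A scalar-width sketch of the 128x128 step with SSE intrinsics, assuming inputs are GF(2)[x] polynomials packed little-endian into a __m128i (compile with -mpclmul):

#include <wmmintrin.h>   /* _mm_clmulepi64_si128 */
#include <emmintrin.h>

/* 128x128 -> 256-bit product in GF(2)[x], schoolbook over 64-bit halves */
static void clmul128(__m128i a, __m128i b, __m128i *lo, __m128i *hi) {
    __m128i ll  = _mm_clmulepi64_si128(a, b, 0x00);                /* a_lo * b_lo */
    __m128i hh  = _mm_clmulepi64_si128(a, b, 0x11);                /* a_hi * b_hi */
    __m128i mid = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x01),  /* a_hi * b_lo */
                                _mm_clmulepi64_si128(a, b, 0x10)); /* a_lo * b_hi */
    *lo = _mm_xor_si128(ll, _mm_slli_si128(mid, 8));  /* cross terms land at bit 64 */
    *hi = _mm_xor_si128(hh, _mm_srli_si128(mid, 8));
}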

+ 5625
- 0
crypto_kem/ntruhps2048509/avx2/poly_rq_mul.s
File diff suppressed because it is too large


+ 955
- 0
crypto_kem/ntruhps2048509/avx2/poly_rq_to_s3.s

@@ -0,0 +1,955 @@
.data
.p2align 5
const_3_repeating:
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
.word 0x3
shuf_b8_to_low_doubleword:
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
.byte 8
.byte 255
mask_modq:
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
.word 2047
mask_ff:
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
mask_f:
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
mask_3:
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.text
.global PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3
.global _PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3
PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3:
_PQCLEAN_NTRUHPS2048509_AVX2_poly_Rq_to_S3:
vmovdqa const_3_repeating(%rip), %ymm3
vmovdqa mask_modq(%rip), %ymm6
vmovdqa 992(%rsi), %ymm4
vpand %ymm6, %ymm4, %ymm4
vpsrlw $10, %ymm4, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm4
vpsrlw $8, %ymm4, %ymm5
vpand mask_ff(%rip), %ymm4, %ymm4
vpaddw %ymm5, %ymm4, %ymm5
vpand mask_f(%rip), %ymm5, %ymm4
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm4, %ymm5
vpand mask_3(%rip), %ymm5, %ymm4
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm4, %ymm5
vpand mask_3(%rip), %ymm5, %ymm4
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm4, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm4
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm4, %ymm5
vpsllw $1, %ymm5, %ymm4
vextracti128 $1, %ymm4, %xmm4
vpshufb shuf_b8_to_low_doubleword(%rip), %ymm4, %ymm4
vinserti128 $1, %xmm4, %ymm4, %ymm4
vmovdqa 0(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 0(%rdi)
vmovdqa 32(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 32(%rdi)
vmovdqa 64(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 64(%rdi)
vmovdqa 96(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 96(%rdi)
vmovdqa 128(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 128(%rdi)
vmovdqa 160(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 160(%rdi)
vmovdqa 192(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 192(%rdi)
vmovdqa 224(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 224(%rdi)
vmovdqa 256(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 256(%rdi)
vmovdqa 288(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 288(%rdi)
vmovdqa 320(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 320(%rdi)
vmovdqa 352(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 352(%rdi)
vmovdqa 384(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 384(%rdi)
vmovdqa 416(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 416(%rdi)
vmovdqa 448(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 448(%rdi)
vmovdqa 480(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 480(%rdi)
vmovdqa 512(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 512(%rdi)
vmovdqa 544(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 544(%rdi)
vmovdqa 576(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 576(%rdi)
vmovdqa 608(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 608(%rdi)
vmovdqa 640(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 640(%rdi)
vmovdqa 672(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 672(%rdi)
vmovdqa 704(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 704(%rdi)
vmovdqa 736(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 736(%rdi)
vmovdqa 768(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 768(%rdi)
vmovdqa 800(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 800(%rdi)
vmovdqa 832(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 832(%rdi)
vmovdqa 864(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 864(%rdi)
vmovdqa 896(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 896(%rdi)
vmovdqa 928(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 928(%rdi)
vmovdqa 960(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 960(%rdi)
vmovdqa 992(%rsi), %ymm1
vpand %ymm6, %ymm1, %ymm1
vpsrlw $10, %ymm1, %ymm0
vpxor %ymm3, %ymm0, %ymm0
vpsllw $11, %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
vpaddw %ymm4, %ymm0, %ymm0
vpsrlw $8, %ymm0, %ymm5
vpand mask_ff(%rip), %ymm0, %ymm0
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_f(%rip), %ymm5, %ymm0
vpsrlw $4, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpand mask_3(%rip), %ymm5, %ymm0
vpsrlw $2, %ymm5, %ymm5
vpaddw %ymm5, %ymm0, %ymm5
vpsubw mask_3(%rip), %ymm5, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm0
vpand %ymm15, %ymm5, %ymm14
vpxor %ymm14, %ymm0, %ymm5
vmovdqa %ymm5, 992(%rdi)
ret
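
Note: every 32-byte block of this routine repeats the same lane-wise arithmetic. A scalar model of one lane, assuming q = 2048, input coefficients in [0, q), and that the value broadcast from offset 992 is twice the mod-3 reduction of the last coefficient (the mod (x^n - 1)/(x - 1) correction); all names below are illustrative:

#include <stdint.h>

#define Q 2048u

/* digit-sum reduction mod 3, as in the vpsrlw/vpand/vpaddw chains above
   (arithmetic right shift assumed, matching vpsraw) */
static uint16_t mod3(uint16_t a) {
    uint16_t r;
    int16_t t, c;
    r = (a >> 8) + (a & 0xff);   /* 256 = 1 mod 3 */
    r = (r >> 4) + (r & 0xf);    /*  16 = 1 mod 3 */
    r = (r >> 2) + (r & 3);      /*   4 = 1 mod 3 */
    r = (r >> 2) + (r & 3);
    t = (int16_t)r - 3;
    c = t >> 15;                             /* all-ones if t < 0 */
    return (uint16_t)((c & r) ^ (~c & t));   /* r if r < 3, else r - 3 */
}

/* one lane of poly_Rq_to_S3 */
static uint16_t rq_to_s3_coeff(uint16_t a, uint16_t two_last) {
    uint16_t r = a & (Q - 1);        /* reduce mod q */
    r += ((r >> 10) ^ 3) << 11;      /* += 3q if r < q/2, += 2q (= -q mod 3) otherwise */
    return mod3(r + two_last);       /* two_last = 2 * (last coefficient mod 3) */
}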

+ 463
- 0
crypto_kem/ntruhps2048509/avx2/poly_s3_inv.c

@@ -0,0 +1,463 @@
#include "poly.h"

#include <immintrin.h>

typedef signed char small;

#define p 508
#define ppad 512
#define numvec 2

typedef __m256i vec256;

/*
This code stores 512-coeff poly as vec256[2].
Order of 256 coefficients in each vec256
is optimized in light of costs of vector instructions:
0,4,...,252 in 64-bit word;
1,5,...,253 in 64-bit word;
2,6,...,254 in 64-bit word;
3,7,...,255 in 64-bit word.
*/
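/* (Editor's illustration, not part of the diff: under this ordering,
   coefficient c = 4*k + j of a vec256 sits at bit k of its j-th 64-bit
   word, e.g. coefficient 10 = 4*2 + 2 is bit 2 of word 2.) */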

static inline void vec256_frombits(vec256 *v, const small *b) {
int i;

for (i = 0; i < numvec; ++i) {
vec256 b0 = _mm256_loadu_si256((vec256 *) b);
b += 32; /* 0,1,...,31 */
vec256 b1 = _mm256_loadu_si256((vec256 *) b);
b += 32; /* 32,33,... */
vec256 b2 = _mm256_loadu_si256((vec256 *) b);
b += 32;
vec256 b3 = _mm256_loadu_si256((vec256 *) b);
b += 32;
vec256 b4 = _mm256_loadu_si256((vec256 *) b);
b += 32;
vec256 b5 = _mm256_loadu_si256((vec256 *) b);
b += 32;
vec256 b6 = _mm256_loadu_si256((vec256 *) b);
b += 32;
vec256 b7 = _mm256_loadu_si256((vec256 *) b);
b += 32;

vec256 c0 = _mm256_unpacklo_epi32(b0, b1); /* 0 1 2 3 32 33 34 35 4 5 6 7 36 37 38 39 ... 55 */
vec256 c1 = _mm256_unpackhi_epi32(b0, b1); /* 8 9 10 11 40 41 42 43 ... 63 */
vec256 c2 = _mm256_unpacklo_epi32(b2, b3);
vec256 c3 = _mm256_unpackhi_epi32(b2, b3);
vec256 c4 = _mm256_unpacklo_epi32(b4, b5);
vec256 c5 = _mm256_unpackhi_epi32(b4, b5);
vec256 c6 = _mm256_unpacklo_epi32(b6, b7);
vec256 c7 = _mm256_unpackhi_epi32(b6, b7);

vec256 d0 = c0 | _mm256_slli_epi32(c1, 2); /* 0 8, 1 9, 2 10, 3 11, 32 40, 33 41, ..., 55 63 */
vec256 d2 = c2 | _mm256_slli_epi32(c3, 2);
vec256 d4 = c4 | _mm256_slli_epi32(c5, 2);
vec256 d6 = c6 | _mm256_slli_epi32(c7, 2);

vec256 e0 = _mm256_unpacklo_epi64(d0, d2);
vec256 e2 = _mm256_unpackhi_epi64(d0, d2);
vec256 e4 = _mm256_unpacklo_epi64(d4, d6);
vec256 e6 = _mm256_unpackhi_epi64(d4, d6);

vec256 f0 = e0 | _mm256_slli_epi32(e2, 1);
vec256 f4 = e4 | _mm256_slli_epi32(e6, 1);

vec256 g0 = _mm256_permute2x128_si256(f0, f4, 0x20);
vec256 g4 = _mm256_permute2x128_si256(f0, f4, 0x31);

vec256 h = g0 | _mm256_slli_epi32(g4, 4);

#define TRANSPOSE _mm256_set_epi8( 31,27,23,19, 30,26,22,18, 29,25,21,17, 28,24,20,16, 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0 )
h = _mm256_shuffle_epi8(h, TRANSPOSE);
h = _mm256_permute4x64_epi64(h, 0xd8);
h = _mm256_shuffle_epi32(h, 0xd8);

*v++ = h;
}
}

static inline void vec256_tobits(const vec256 *v, small *b) {
int i;

for (i = 0; i < numvec; ++i) {
vec256 h = *v++;

h = _mm256_shuffle_epi32(h, 0xd8);
h = _mm256_permute4x64_epi64(h, 0xd8);
h = _mm256_shuffle_epi8(h, TRANSPOSE);

vec256 g0 = h & _mm256_set1_epi8(15);
vec256 g4 = _mm256_srli_epi32(h, 4) & _mm256_set1_epi8(15);

vec256 f0 = _mm256_permute2x128_si256(g0, g4, 0x20);
vec256 f4 = _mm256_permute2x128_si256(g0, g4, 0x31);

vec256 e0 = f0 & _mm256_set1_epi8(5);
vec256 e2 = _mm256_srli_epi32(f0, 1) & _mm256_set1_epi8(5);
vec256 e4 = f4 & _mm256_set1_epi8(5);
vec256 e6 = _mm256_srli_epi32(f4, 1) & _mm256_set1_epi8(5);

vec256 d0 = _mm256_unpacklo_epi32(e0, e2);
vec256 d2 = _mm256_unpackhi_epi32(e0, e2);
vec256 d4 = _mm256_unpacklo_epi32(e4, e6);
vec256 d6 = _mm256_unpackhi_epi32(e4, e6);

vec256 c0 = d0 & _mm256_set1_epi8(1);
vec256 c1 = _mm256_srli_epi32(d0, 2) & _mm256_set1_epi8(1);
vec256 c2 = d2 & _mm256_set1_epi8(1);
vec256 c3 = _mm256_srli_epi32(d2, 2) & _mm256_set1_epi8(1);
vec256 c4 = d4 & _mm256_set1_epi8(1);
vec256 c5 = _mm256_srli_epi32(d4, 2) & _mm256_set1_epi8(1);
vec256 c6 = d6 & _mm256_set1_epi8(1);
vec256 c7 = _mm256_srli_epi32(d6, 2) & _mm256_set1_epi8(1);

vec256 b0 = _mm256_unpacklo_epi64(c0, c1);
vec256 b1 = _mm256_unpackhi_epi64(c0, c1);
vec256 b2 = _mm256_unpacklo_epi64(c2, c3);
vec256 b3 = _mm256_unpackhi_epi64(c2, c3);
vec256 b4 = _mm256_unpacklo_epi64(c4, c5);
vec256 b5 = _mm256_unpackhi_epi64(c4, c5);
vec256 b6 = _mm256_unpacklo_epi64(c6, c7);
vec256 b7 = _mm256_unpackhi_epi64(c6, c7);

_mm256_storeu_si256((vec256 *) b, b0);
b += 32;
_mm256_storeu_si256((vec256 *) b, b1);
b += 32;
_mm256_storeu_si256((vec256 *) b, b2);
b += 32;
_mm256_storeu_si256((vec256 *) b, b3);
b += 32;
_mm256_storeu_si256((vec256 *) b, b4);
b += 32;
_mm256_storeu_si256((vec256 *) b, b5);
b += 32;
_mm256_storeu_si256((vec256 *) b, b6);
b += 32;
_mm256_storeu_si256((vec256 *) b, b7);
b += 32;
}
}

static void vec256_init(vec256 *G0, vec256 *G1, const small *s) {
int i;
small srev[ppad + (ppad - p)];
small si;
small g0[ppad];
small g1[ppad];

for (i = 0; i < p; ++i) {
srev[ppad - 1 - i] = s[i];
}
for (i = 0; i < ppad - p; ++i) {
srev[i] = 0;
}
for (i = p; i < ppad; ++i) {
srev[i + ppad - p] = 0;
}

for (i = 0; i < ppad; ++i) {
si = srev[i + ppad - p];
g0[i] = si & 1;
g1[i] = (si >> 1) & g0[i];
}

vec256_frombits(G0, g0);
vec256_frombits(G1, g1);
}

static void vec256_final(small *out, const vec256 *V0, const vec256 *V1) {
int i;
small v0[ppad];
small v1[ppad];
small v[ppad];
small vrev[ppad + (ppad - p)];

vec256_tobits(V0, v0);
vec256_tobits(V1, v1);

for (i = 0; i < ppad; ++i) {
v[i] = v0[i] + 2 * v1[i] - 4 * (v0[i] & v1[i]);
}

for (i = 0; i < ppad; ++i) {
vrev[i] = v[ppad - 1 - i];
}
for (i = ppad; i < ppad + (ppad - p); ++i) {
vrev[i] = 0;
}

for (i = 0; i < p; ++i) {
out[i] = vrev[i + ppad - p];
}
}

static inline int negative_mask(int x) {
return x >> 31;
}

static inline void vec256_swap(vec256 *f, vec256 *g, int len, vec256 mask) {
vec256 flip;
int i;

for (i = 0; i < len; ++i) {
flip = mask & (f[i] ^ g[i]);
f[i] ^= flip;
g[i] ^= flip;
}
}

static inline void vec256_scale(vec256 *f0, vec256 *f1, const vec256 c0, const vec256 c1) {
int i;

for (i = 0; i < numvec; ++i) {
vec256 f0i = f0[i];
vec256 f1i = f1[i];

f0i &= c0;
f1i ^= c1;
f1i &= f0i;

f0[i] = f0i;
f1[i] = f1i;
}
}

static inline void vec256_eliminate(vec256 *f0, vec256 *f1, vec256 *g0, vec256 *g1, int len, const vec256 c0, const vec256 c1) {
int i;

for (i = 0; i < len; ++i) {
vec256 f0i = f0[i];
vec256 f1i = f1[i];
vec256 g0i = g0[i];
vec256 g1i = g1[i];
vec256 t;

f0i &= c0;
f1i ^= c1;
f1i &= f0i;

t = g0i ^ f0i;
g0[i] = t | (g1i ^ f1i);
g1[i] = (g1i ^ f0i) & (f1i ^ t);
}
}

static inline int vec256_bit0mask(vec256 *f) {
return -(_mm_cvtsi128_si32(_mm256_castsi256_si128(f[0])) & 1);
}

static inline void vec256_divx_1(vec256 *f) {
vec256 f0 = f[0];

unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));

low0 = low0 >> 1;

f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);

f[0] = _mm256_permute4x64_epi64(f0, 0x39);
}

static inline void vec256_divx_2(vec256 *f) {
vec256 f0 = f[0];
vec256 f1 = f[1];

unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1));

low0 = (low0 >> 1) | (low1 << 63);
low1 = low1 >> 1;

f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3);

f[0] = _mm256_permute4x64_epi64(f0, 0x39);
f[1] = _mm256_permute4x64_epi64(f1, 0x39);
}

static inline void vec256_timesx_1(vec256 *f) {
vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93);

unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));

low0 = low0 << 1;

f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);

f[0] = f0;
}

static inline void vec256_timesx_2(vec256 *f) {
vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93);
vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93);

unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1));

low1 = (low1 << 1) | (low0 >> 63);
low0 = low0 << 1;

f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3);

f[0] = f0;
f[1] = f1;
}


static int __poly_S3_inv(unsigned char *outbytes, const unsigned char *inbytes) {
small *out = (void *) outbytes;
small *in = (void *) inbytes;
vec256 F0[numvec];
vec256 F1[numvec];
vec256 G0[numvec];
vec256 G1[numvec];
vec256 V0[numvec];
vec256 V1[numvec];
vec256 R0[numvec];
vec256 R1[numvec];
vec256 c0vec, c1vec;
int loop;
int c0, c1;
int minusdelta = -1;
int swapmask;
vec256 swapvec;

vec256_init(G0, G1, in);
F0[0] = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
F0[1] = _mm256_set_epi32(2147483647, -1, 2147483647, -1, 2147483647, -1, -1, -1);
F1[0] = _mm256_set1_epi32(0);
F1[1] = _mm256_set1_epi32(0);

V0[0] = _mm256_set1_epi32(0);
V1[0] = _mm256_set1_epi32(0);
V0[1] = _mm256_set1_epi32(0);
V1[1] = _mm256_set1_epi32(0);

R0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1);
R1[0] = _mm256_set1_epi32(0);
R0[1] = _mm256_set1_epi32(0);
R1[1] = _mm256_set1_epi32(0);

for (loop = 256; loop > 0; --loop) {
vec256_timesx_1(V0);
vec256_timesx_1(V1);
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);

c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
c1 &= c0;

minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
minusdelta -= 1;

swapvec = _mm256_set1_epi32(swapmask);
vec256_swap(F0, G0, 2, swapvec);
vec256_swap(F1, G1, 2, swapvec);

c0vec = _mm256_set1_epi32(c0);
c1vec = _mm256_set1_epi32(c1);

vec256_eliminate(F0, F1, G0, G1, 2, c0vec, c1vec);
vec256_divx_2(G0);
vec256_divx_2(G1);

vec256_swap(V0, R0, 1, swapvec);
vec256_swap(V1, R1, 1, swapvec);
vec256_eliminate(V0, V1, R0, R1, 1, c0vec, c1vec);
}

for (loop = 503; loop > 0; --loop) {
vec256_timesx_2(V0);
vec256_timesx_2(V1);
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);

c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
c1 &= c0;

minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
minusdelta -= 1;

swapvec = _mm256_set1_epi32(swapmask);
vec256_swap(F0, G0, 2, swapvec);
vec256_swap(F1, G1, 2, swapvec);

c0vec = _mm256_set1_epi32(c0);
c1vec = _mm256_set1_epi32(c1);

vec256_eliminate(F0, F1, G0, G1, 2, c0vec, c1vec);
vec256_divx_2(G0);
vec256_divx_2(G1);

vec256_swap(V0, R0, 2, swapvec);
vec256_swap(V1, R1, 2, swapvec);
vec256_eliminate(V0, V1, R0, R1, 2, c0vec, c1vec);
}

for (loop = 256; loop > 0; --loop) {
vec256_timesx_2(V0);
vec256_timesx_2(V1);
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);

c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
c1 &= c0;

minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
minusdelta -= 1;

swapvec = _mm256_set1_epi32(swapmask);
vec256_swap(F0, G0, 1, swapvec);
vec256_swap(F1, G1, 1, swapvec);

c0vec = _mm256_set1_epi32(c0);
c1vec = _mm256_set1_epi32(c1);

vec256_eliminate(F0, F1, G0, G1, 1, c0vec, c1vec);
vec256_divx_1(G0);
vec256_divx_1(G1);

vec256_swap(V0, R0, 2, swapvec);
vec256_swap(V1, R1, 2, swapvec);
vec256_eliminate(V0, V1, R0, R1, 2, c0vec, c1vec);
}

c0vec = _mm256_set1_epi32(vec256_bit0mask(F0));
c1vec = _mm256_set1_epi32(vec256_bit0mask(F1));
vec256_scale(V0, V1, c0vec, c1vec);

vec256_final(out, V0, V1);
out[p] = negative_mask(minusdelta);
return 0;
}

// This code is based on crypto_core/invhrss701/faster from SUPERCOP. The code was written as a case study
// for the paper "Fast constant-time gcd computation and modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
void PQCLEAN_NTRUHPS2048509_AVX2_poly_S3_inv(poly *r_out, const poly *a) {
const unsigned char *in = (void *) a;
unsigned char *out = (void *) r_out;

small input[ppad];
small output[ppad];
int i;

/* XXX: obviously input/output format should be packed into bytes */

for (i = 0; i < p; ++i) {
small x = in[2 * i] & 3; /* 0 1 2 3 */
x += 1; /* 0 1 2 3 4 5 6, offset by 1 */
x &= (x - 3) >> 5; /* 0 1 2, offset by 1 */
input[i] = x - 1;
}
/* XXX: merge with vec256_init */

__poly_S3_inv((unsigned char *)output, (unsigned char *)input);

for (i = 0; i < p; ++i) {
out[2 * i] = (3 & output[i]) ^ ((3 & output[i]) >> 1);
out[2 * i + 1] = 0;
}
}
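
Note: the three loops in __poly_S3_inv run 256 + 503 + 256 = 1015 = 2*508 - 1 bit-sliced divsteps in the style of the constant-time gcd algorithm cited above; every data-dependent branch is replaced by mask arithmetic. A scalar sketch of the two branch-free idioms they rely on (names are illustrative):

#include <stdint.h>

/* branch-free conditional swap: mask is 0 or all-ones, as produced by
   negative_mask above; when all-ones, *f and *g are exchanged */
static void cond_swap(uint64_t *f, uint64_t *g, uint64_t mask) {
    uint64_t flip = mask & (*f ^ *g);
    *f ^= flip;
    *g ^= flip;
}

/* branch-free divstep counter update, tracking minusdelta = -delta:
   on a swap delta is negated, and every step delta increases by 1 */
static int update_minusdelta(int minusdelta, int swapmask) {
    minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
    return minusdelta - 1;
}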

+ 46
- 0
crypto_kem/ntruhps2048509/avx2/sample.c

@@ -0,0 +1,46 @@
#include "sample.h"

void PQCLEAN_NTRUHPS2048509_AVX2_sample_fg(poly *f, poly *g, const unsigned char uniformbytes[NTRU_SAMPLE_FG_BYTES]) {

PQCLEAN_NTRUHPS2048509_AVX2_sample_iid(f, uniformbytes);
PQCLEAN_NTRUHPS2048509_AVX2_sample_fixed_type(g, uniformbytes + NTRU_SAMPLE_IID_BYTES);
}

void PQCLEAN_NTRUHPS2048509_AVX2_sample_rm(poly *r, poly *m, const unsigned char uniformbytes[NTRU_SAMPLE_RM_BYTES]) {

PQCLEAN_NTRUHPS2048509_AVX2_sample_iid(r, uniformbytes);
PQCLEAN_NTRUHPS2048509_AVX2_sample_fixed_type(m, uniformbytes + NTRU_SAMPLE_IID_BYTES);
}


#include "crypto_sort_int32.h"
void PQCLEAN_NTRUHPS2048509_AVX2_sample_fixed_type(poly *r, const unsigned char u[NTRU_SAMPLE_FT_BYTES]) {
// Assumes NTRU_SAMPLE_FT_BYTES = ceil(30*(n-1)/8)

int32_t s[NTRU_N - 1];
int i;

// Use 30 bits of u per word
for (i = 0; i < (NTRU_N - 1) / 4; i++) {
s[4 * i + 0] = (u[15 * i + 0] << 2) + (u[15 * i + 1] << 10) + (u[15 * i + 2] << 18) + ((uint32_t) u[15 * i + 3] << 26);
s[4 * i + 1] = ((u[15 * i + 3] & 0xc0) >> 4) + (u[15 * i + 4] << 4) + (u[15 * i + 5] << 12) + (u[15 * i + 6] << 20) + ((uint32_t) u[15 * i + 7] << 28);
s[4 * i + 2] = ((u[15 * i + 7] & 0xf0) >> 2) + (u[15 * i + 8] << 6) + (u[15 * i + 9] << 14) + (u[15 * i + 10] << 22) + ((uint32_t) u[15 * i + 11] << 30);
s[4 * i + 3] = (u[15 * i + 11] & 0xfc) + (u[15 * i + 12] << 8) + (u[15 * i + 13] << 16) + ((uint32_t) u[15 * i + 14] << 24);
}

for (i = 0; i < NTRU_WEIGHT / 2; i++) {
s[i] |= 1;
}

for (i = NTRU_WEIGHT / 2; i < NTRU_WEIGHT; i++) {
s[i] |= 2;
}

PQCLEAN_NTRUHPS2048509_AVX2_crypto_sort_int32(s, NTRU_N - 1);

for (i = 0; i < NTRU_N - 1; i++) {
r->coeffs[i] = ((uint16_t) (s[i] & 3));
}

r->coeffs[NTRU_N - 1] = 0;
}
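
Note: this is the standard sort-based fixed-type sampler: give each of the 508 positions a 30-bit random key with the low two bits cleared, pin the low bits of the first NTRU_WEIGHT/2 entries to 1 and of the next NTRU_WEIGHT/2 to 2, sort, and read the low two bits back as the coefficients. A small self-contained model of the idea with toy parameters, not part of the diff (rand() and qsort stand in for the caller-supplied uniform bytes and the constant-time sort):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_N 16   /* stands in for NTRU_N - 1 = 508 */
#define TOY_W 8    /* stands in for NTRU_WEIGHT */

static int cmp_i32(const void *a, const void *b) {
    int32_t x = *(const int32_t *)a, y = *(const int32_t *)b;
    return (x > y) - (x < y);
}

int main(void) {
    int32_t s[TOY_N];
    int i;
    for (i = 0; i < TOY_N; i++) {
        s[i] = (rand() & 0x1fffffff) << 2;   /* random key, low two bits left free */
    }
    for (i = 0; i < TOY_W / 2; i++) {
        s[i] |= 1;                           /* will become +1 coefficients */
    }
    for (i = TOY_W / 2; i < TOY_W; i++) {
        s[i] |= 2;                           /* will become -1 (2 mod 3) coefficients */
    }
    qsort(s, TOY_N, sizeof s[0], cmp_i32);   /* sorting by the keys permutes the trits */
    for (i = 0; i < TOY_N; i++) {
        printf("%d ", s[i] & 3);
    }
    printf("\n");
    return 0;
}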

+ 15
- 0
crypto_kem/ntruhps2048509/avx2/sample.h

@@ -0,0 +1,15 @@
#ifndef SAMPLE_H
#define SAMPLE_H

#include "params.h"
#include "poly.h"

void PQCLEAN_NTRUHPS2048509_AVX2_sample_fg(poly *f, poly *g, const unsigned char uniformbytes[NTRU_SAMPLE_FG_BYTES]);
void PQCLEAN_NTRUHPS2048509_AVX2_sample_rm(poly *r, poly *m, const unsigned char uniformbytes[NTRU_SAMPLE_RM_BYTES]);

void PQCLEAN_NTRUHPS2048509_AVX2_sample_iid(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_IID_BYTES]);

void PQCLEAN_NTRUHPS2048509_AVX2_sample_fixed_type(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_FT_BYTES]);


#endif

+ 21
- 0
crypto_kem/ntruhps2048509/avx2/sample_iid.c

@@ -0,0 +1,21 @@
#include <immintrin.h>

#include "sample.h"

extern void PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid(poly *r, const unsigned char uniformbytes[PAD32(NTRU_SAMPLE_IID_BYTES)]);

void PQCLEAN_NTRUHPS2048509_AVX2_sample_iid(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_IID_BYTES]) {
int i;
union { /* align to 32 byte boundary for vmovdqa */
unsigned char b[PAD32(NTRU_SAMPLE_IID_BYTES)];
__m256i b_x32[PAD32(NTRU_SAMPLE_IID_BYTES) / 32];
} buffer;

for (i = 0; i < NTRU_SAMPLE_IID_BYTES; i++) {
buffer.b[i] = uniformbytes[i];
}
for (i = NTRU_SAMPLE_IID_BYTES; i < PAD32(NTRU_SAMPLE_IID_BYTES); i++) {
buffer.b[i] = 0;
}
PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid(r, buffer.b);
}

+ 2854
- 0
crypto_kem/ntruhps2048509/avx2/square_126_509_shufbytes.s
File diff suppressed because it is too large


+ 5125
- 0
crypto_kem/ntruhps2048509/avx2/square_15_509_shufbytes.s
File diff suppressed because it is too large


+ 109
- 0
crypto_kem/ntruhps2048509/avx2/square_1_509_patience.s

@@ -0,0 +1,109 @@
.data
.p2align 5
.text
.global PQCLEAN_NTRUHPS2048509_AVX2_square_1_509
.global _PQCLEAN_NTRUHPS2048509_AVX2_square_1_509
PQCLEAN_NTRUHPS2048509_AVX2_square_1_509:
_PQCLEAN_NTRUHPS2048509_AVX2_square_1_509:
push %r15
push %r14
push %r13
push %r12
push %rbx
push %rbp
mov 0(%rsi), %r11
mov %r11, %r10
and $-0x1, %r10
mov $0x5555555555555555, %rbp
pdep %rbp, %r10, %r10
mov %r10, 0(%rdi)
mov $0xffffffff00000000, %rbx
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 8(%rdi)
mov 8(%rsi), %r11
mov %r11, %r10
and $-0x1, %r10
pdep %rbp, %r10, %r10
mov %r10, 16(%rdi)
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 24(%rdi)
mov 16(%rsi), %r11
mov %r11, %r10
and $-0x1, %r10
pdep %rbp, %r10, %r10
mov %r10, 32(%rdi)
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 40(%rdi)
mov 24(%rsi), %r11
mov %r11, %r10
rol $2, %r10
and $0x2, %r10
xor %r10, 0(%rdi)
mov %r11, %r10
and $-0x1, %r10
pdep %rbp, %r10, %r10
mov %r10, 48(%rdi)
mov $0x7fffffff00000000, %r12
pext %r12, %r11, %r10
mov $0x1555555555555555, %r13
pdep %r13, %r10, %r10
mov %r10, 56(%rdi)
mov 32(%rsi), %r11
mov %r11, %r10
and $0x7fffffff, %r10
mov $0xaaaaaaaaaaaaaaa8, %r14
pdep %r14, %r10, %r10
xor %r10, 0(%rdi)
mov $0x7fffffff80000000, %r15
pext %r15, %r11, %r10
mov $0xaaaaaaaaaaaaaaaa, %r9
pdep %r9, %r10, %r10
xor %r10, 8(%rdi)
mov %r11, %r10
rol $2, %r10
and $0x2, %r10
xor %r10, 16(%rdi)
mov 40(%rsi), %r11
mov %r11, %r10
and $0x7fffffff, %r10
pdep %r14, %r10, %r10
xor %r10, 16(%rdi)
pext %r15, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 24(%rdi)
mov %r11, %r10
rol $2, %r10
and $0x2, %r10
xor %r10, 32(%rdi)
mov 48(%rsi), %r11
mov %r11, %r10
and $0x7fffffff, %r10
pdep %r14, %r10, %r10
xor %r10, 32(%rdi)
pext %r15, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 40(%rdi)
mov %r11, %r10
rol $2, %r10
and $0x2, %r10
xor %r10, 48(%rdi)
mov 56(%rsi), %r11
mov %r11, %r10
and $0x7fffffff, %r10
pdep %r14, %r10, %r10
xor %r10, 48(%rdi)
mov $0x1fffffff80000000, %r8
pext %r8, %r11, %r10
mov $0xaaaaaaaaaaaaaaa, %rdx
pdep %rdx, %r10, %r10
xor %r10, 56(%rdi)
pop %rbp
pop %rbx
pop %r12
pop %r13
pop %r14
pop %r15
ret
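
Note: square_1_509 spreads each input bit to an even output position (squaring over GF(2) has no cross terms) using BMI2 pext/pdep, and the remaining instructions appear to fold the bits that overflow x^508 back around, since x^509 = 1 in this ring. A sketch of the core spreading step on one 64-bit limb, ignoring that final folding (requires BMI2, e.g. -mbmi2):

#include <stdint.h>
#include <immintrin.h>   /* _pdep_u64 */

/* (sum a_i x^i)^2 = sum a_i x^(2i) over GF(2): squaring one 64-bit limb
   just deposits its bits into the even positions of two output limbs,
   the pdep-with-0x5555... pattern used above */
static void square_limb(uint64_t in, uint64_t *out_lo, uint64_t *out_hi) {
    *out_lo = _pdep_u64(in & 0xffffffffu, 0x5555555555555555ull);
    *out_hi = _pdep_u64(in >> 32, 0x5555555555555555ull);
}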

+ 3992
- 0
crypto_kem/ntruhps2048509/avx2/square_252_509_shufbytes.s
File diff suppressed because it is too large


+ 4316
- 0
crypto_kem/ntruhps2048509/avx2/square_30_509_shufbytes.s
File diff suppressed because it is too large


+ 272
- 0
crypto_kem/ntruhps2048509/avx2/square_3_509_patience.s

@@ -0,0 +1,272 @@
.data
.p2align 5
.text
.global PQCLEAN_NTRUHPS2048509_AVX2_square_3_509
.global _PQCLEAN_NTRUHPS2048509_AVX2_square_3_509
PQCLEAN_NTRUHPS2048509_AVX2_square_3_509:
_PQCLEAN_NTRUHPS2048509_AVX2_square_3_509:
push %r15
push %r14
push %r13
push %r12
push %rbx
push %rbp
mov 0(%rsi), %r11
mov %r11, %r10
and $0xff, %r10
mov $0x101010101010101, %rbp
pdep %rbp, %r10, %r10
mov %r10, 0(%rdi)
mov $0xff00, %rbx
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 8(%rdi)
mov $0xff0000, %r12
pext %r12, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 16(%rdi)
mov $0xff000000, %r13
pext %r13, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 24(%rdi)
mov $0xff00000000, %r14
pext %r14, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 32(%rdi)
mov $0xff0000000000, %r15
pext %r15, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 40(%rdi)
mov $0xff000000000000, %r9
pext %r9, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 48(%rdi)
mov $0xff00000000000000, %r8
pext %r8, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 56(%rdi)
mov 8(%rsi), %r11
mov %r11, %r10
and $0xff, %r10
mov $0x808080808080808, %rdx
pdep %rdx, %r10, %r10
xor %r10, 0(%rdi)
pext %rbx, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 8(%rdi)
pext %r12, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 16(%rdi)
pext %r13, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 24(%rdi)
pext %r14, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 32(%rdi)
pext %r15, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 40(%rdi)
pext %r9, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 48(%rdi)
pext %r8, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 56(%rdi)
mov 16(%rsi), %r11
mov $0x80000000000000ff, %rcx
pext %rcx, %r11, %r10
mov $0x9010101010101010, %rax
pdep %rax, %r10, %r10
rol $2, %r10
xor %r10, 0(%rdi)
pext %rbx, %r11, %r10
mov $0x4040404040404040, %rbp
pdep %rbp, %r10, %r10
xor %r10, 8(%rdi)
pext %r12, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 16(%rdi)
pext %r13, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 24(%rdi)
pext %r14, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 32(%rdi)
pext %r15, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 40(%rdi)
pext %r9, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 48(%rdi)
mov $0x7f00000000000000, %r8
pext %r8, %r11, %r10
mov $0x40404040404040, %rdx
pdep %rdx, %r10, %r10
xor %r10, 56(%rdi)
mov 24(%rsi), %r11
mov $0x800000000000007f, %rcx
pext %rcx, %r11, %r10
mov $0x8010101010101010, %rax
pdep %rax, %r10, %r10
rol $5, %r10
xor %r10, 0(%rdi)
mov $0x7f80, %rbx
pext %rbx, %r11, %r10
mov $0x202020202020202, %r12
pdep %r12, %r10, %r10
xor %r10, 8(%rdi)
mov $0x7f8000, %r13
pext %r13, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 16(%rdi)
mov $0x7f800000, %r14
pext %r14, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 24(%rdi)
mov $0x7f80000000, %r15
pext %r15, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 32(%rdi)
mov $0x7f8000000000, %r9
pext %r9, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 40(%rdi)
mov $0x7f800000000000, %rbp
pext %rbp, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 48(%rdi)
mov $0x7f80000000000000, %r8
pext %r8, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 56(%rdi)
mov 32(%rsi), %r11
pext %rcx, %r11, %r10
pdep %rax, %r10, %r10
rol $8, %r10
xor %r10, 0(%rdi)
pext %rbx, %r11, %r10
mov $0x1010101010101010, %rdx
pdep %rdx, %r10, %r10
xor %r10, 8(%rdi)
pext %r13, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 16(%rdi)
pext %r14, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 24(%rdi)
pext %r15, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 32(%rdi)
pext %r9, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 40(%rdi)
pext %rbp, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 48(%rdi)
pext %r8, %r11, %r10
pdep %rdx, %r10, %r10
xor %r10, 56(%rdi)
mov 40(%rsi), %r11
mov $0xc00000000000007f, %r12
pext %r12, %r11, %r10
mov $0x8090101010101010, %rcx
pdep %rcx, %r10, %r10
rol $11, %r10
xor %r10, 0(%rdi)
pext %rbx, %r11, %r10
mov $0x8080808080808080, %rax
pdep %rax, %r10, %r10
xor %r10, 8(%rdi)
pext %r13, %r11, %r10
pdep %rax, %r10, %r10
xor %r10, 16(%rdi)
pext %r14, %r11, %r10
pdep %rax, %r10, %r10
xor %r10, 24(%rdi)
pext %r15, %r11, %r10
pdep %rax, %r10, %r10
xor %r10, 32(%rdi)
pext %r9, %r11, %r10
pdep %rax, %r10, %r10
xor %r10, 40(%rdi)
pext %rbp, %r11, %r10
pdep %rax, %r10, %r10
xor %r10, 48(%rdi)
mov $0x3f80000000000000, %r8
pext %r8, %r11, %r10
mov $0x80808080808080, %rdx
pdep %rdx, %r10, %r10
xor %r10, 56(%rdi)
mov 48(%rsi), %r11
mov $0xc00000000000003f, %r12
pext %r12, %r11, %r10
mov $0x8080101010101010, %rcx
pdep %rcx, %r10, %r10
rol $14, %r10
xor %r10, 0(%rdi)
mov $0x3fc0, %rbx
pext %rbx, %r11, %r10
mov $0x404040404040404, %r13
pdep %r13, %r10, %r10
xor %r10, 8(%rdi)
mov $0x3fc000, %r14
pext %r14, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 16(%rdi)
mov $0x3fc00000, %r15
pext %r15, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 24(%rdi)
mov $0x3fc0000000, %r9
pext %r9, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 32(%rdi)
mov $0x3fc000000000, %rbp
pext %rbp, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 40(%rdi)
mov $0x3fc00000000000, %rax
pext %rax, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 48(%rdi)
mov $0x3fc0000000000000, %r8
pext %r8, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 56(%rdi)
mov 56(%rsi), %r11
mov %r11, %r10
and $0x3f, %r10
mov $0x2020202020200000, %rdx
pdep %rdx, %r10, %r10
xor %r10, 0(%rdi)
pext %rbx, %r11, %r10
mov $0x2020202020202020, %r12
pdep %r12, %r10, %r10
xor %r10, 8(%rdi)
pext %r14, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 16(%rdi)
pext %r15, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 24(%rdi)
pext %r9, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 32(%rdi)
pext %rbp, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 40(%rdi)
pext %rax, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 48(%rdi)
mov $0x1fc0000000000000, %rcx
pext %rcx, %r11, %r10
mov $0x20202020202020, %r8
pdep %r8, %r10, %r10
xor %r10, 56(%rdi)
pop %rbp
pop %rbx
pop %r12
pop %r13
pop %r14
pop %r15
ret

+ 4186
- 0
crypto_kem/ntruhps2048509/avx2/square_63_509_shufbytes.s
File diff suppressed because it is too large


+ 296
- 0
crypto_kem/ntruhps2048509/avx2/square_6_509_patience.s

@@ -0,0 +1,296 @@
.data
.p2align 5
.text
.global PQCLEAN_NTRUHPS2048509_AVX2_square_6_509
.global _PQCLEAN_NTRUHPS2048509_AVX2_square_6_509
PQCLEAN_NTRUHPS2048509_AVX2_square_6_509:
_PQCLEAN_NTRUHPS2048509_AVX2_square_6_509:
push %r15
push %r14
push %r13
push %r12
push %rbx
push %rbp
mov 0(%rsi), %r11
mov $0x101010101010101, %rbp
pext %rbp, %r11, %r10
mov $0x249249, %rbx
pdep %rbx, %r10, %r10
mov %r10, 0(%rdi)
mov $0x202020202020202, %r12
pext %r12, %r11, %r10
pdep %rbx, %r10, %r10
mov %r10, 8(%rdi)
mov $0x404040404040404, %r13
pext %r13, %r11, %r10
pdep %rbx, %r10, %r10
mov %r10, 16(%rdi)
mov $0x808080808080808, %r14
pext %r14, %r11, %r10
pdep %rbx, %r10, %r10
mov %r10, 24(%rdi)
mov $0x1010101010101010, %r15
pext %r15, %r11, %r10
pdep %rbx, %r10, %r10
mov %r10, 32(%rdi)
mov $0x2020202020202020, %r9
pext %r9, %r11, %r10
pdep %rbx, %r10, %r10
mov %r10, 40(%rdi)
mov $0x4040404040404040, %r8
pext %r8, %r11, %r10
pdep %rbx, %r10, %r10
mov %r10, 48(%rdi)
mov $0x8080808080808080, %rdx
pext %rdx, %r11, %r10
pdep %rbx, %r10, %r10
mov %r10, 56(%rdi)
mov 8(%rsi), %r11
pext %rbp, %r11, %r10
mov $0x249249000000, %rcx
pdep %rcx, %r10, %r10
xor %r10, 0(%rdi)
pext %r12, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 8(%rdi)
pext %r13, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 16(%rdi)
pext %r14, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 24(%rdi)
pext %r15, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 32(%rdi)
pext %r9, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 40(%rdi)
pext %r8, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 48(%rdi)
pext %rdx, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 56(%rdi)
mov 16(%rsi), %r11
mov $0x8080810101010101, %rax
pext %rax, %r11, %r10
mov $0x9249248000000000, %rbx
pdep %rbx, %r10, %r10
rol $9, %r10
xor %r10, 0(%rdi)
mov $0x101020202020202, %rbp
pext %rbp, %r11, %r10
mov $0x9249240000000000, %r12
pdep %r12, %r10, %r10
rol $6, %r10
xor %r10, 8(%rdi)
mov $0x202040404040404, %r13
pext %r13, %r11, %r10
pdep %r12, %r10, %r10
rol $6, %r10
xor %r10, 16(%rdi)
mov $0x404080808080808, %r14
pext %r14, %r11, %r10
pdep %r12, %r10, %r10
rol $6, %r10
xor %r10, 24(%rdi)
mov $0x808101010101010, %r15
pext %r15, %r11, %r10
pdep %r12, %r10, %r10
rol $6, %r10
xor %r10, 32(%rdi)
mov $0x1010202020202020, %r9
pext %r9, %r11, %r10
pdep %r12, %r10, %r10
rol $6, %r10
xor %r10, 40(%rdi)
mov $0x2020404040404040, %r8
pext %r8, %r11, %r10
pdep %r12, %r10, %r10
rol $6, %r10
xor %r10, 48(%rdi)
mov $0x4040008080808080, %rdx
pext %rdx, %r11, %r10
mov $0x9049240000000000, %rcx
pdep %rcx, %r10, %r10
rol $6, %r10
xor %r10, 56(%rdi)
mov 24(%rsi), %r11
mov $0x8080808080808080, %rax
pext %rax, %r11, %r10
mov $0x124924800, %rbx
pdep %rbx, %r10, %r10
xor %r10, 0(%rdi)
mov $0x101010101010101, %rbp
pext %rbp, %r11, %r10
mov $0x24924900, %r13
pdep %r13, %r10, %r10
xor %r10, 8(%rdi)
mov $0x202020202020202, %r14
pext %r14, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 16(%rdi)
mov $0x404040404040404, %r15
pext %r15, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 24(%rdi)
mov $0x808080808080808, %r9
pext %r9, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 32(%rdi)
mov $0x1010101010101010, %r8
pext %r8, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 40(%rdi)
mov $0x2020202020202020, %r12
pext %r12, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 48(%rdi)
mov $0x4040404040404040, %rdx
pext %rdx, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 56(%rdi)
mov 32(%rsi), %r11
pext %rax, %r11, %r10
mov $0x124924800000000, %rcx
pdep %rcx, %r10, %r10
xor %r10, 0(%rdi)
pext %rbp, %r11, %r10
mov $0x24924900000000, %rbx
pdep %rbx, %r10, %r10
xor %r10, 8(%rdi)
pext %r14, %r11, %r10
pdep %rbx, %r10, %r10
xor %r10, 16(%rdi)
pext %r15, %r11, %r10
pdep %rbx, %r10, %r10
xor %r10, 24(%rdi)
pext %r9, %r11, %r10
pdep %rbx, %r10, %r10
xor %r10, 32(%rdi)
pext %r8, %r11, %r10
pdep %rbx, %r10, %r10
xor %r10, 40(%rdi)
pext %r12, %r11, %r10
pdep %rbx, %r10, %r10
xor %r10, 48(%rdi)
pext %rdx, %r11, %r10
pdep %rbx, %r10, %r10
xor %r10, 56(%rdi)
mov 40(%rsi), %r11
mov $0x4040404040408080, %r13
pext %r13, %r11, %r10
mov $0x9249240000000000, %rax
pdep %rax, %r10, %r10
rol $17, %r10
xor %r10, 0(%rdi)
mov $0x8080808080810101, %rcx
pext %rcx, %r11, %r10
mov $0x9249248000000000, %rbp
pdep %rbp, %r10, %r10
rol $17, %r10
xor %r10, 8(%rdi)
mov $0x101010101020202, %r14
pext %r14, %r11, %r10
pdep %rax, %r10, %r10
rol $14, %r10
xor %r10, 16(%rdi)
mov $0x202020202040404, %r15
pext %r15, %r11, %r10
pdep %rax, %r10, %r10
rol $14, %r10
xor %r10, 24(%rdi)
mov $0x404040404080808, %r9
pext %r9, %r11, %r10
pdep %rax, %r10, %r10
rol $14, %r10
xor %r10, 32(%rdi)
mov $0x808080808101010, %r8
pext %r8, %r11, %r10
pdep %rax, %r10, %r10
rol $14, %r10
xor %r10, 40(%rdi)
mov $0x1010101010202020, %r12
pext %r12, %r11, %r10
pdep %rax, %r10, %r10
rol $14, %r10
xor %r10, 48(%rdi)
mov $0x2020202020004040, %rdx
pext %rdx, %r11, %r10
mov $0x9248240000000000, %rbx
pdep %rbx, %r10, %r10
rol $14, %r10
xor %r10, 56(%rdi)
mov 48(%rsi), %r11
mov $0x4040404040404040, %r13
pext %r13, %r11, %r10
mov $0x12492480000, %rcx
pdep %rcx, %r10, %r10
xor %r10, 0(%rdi)
mov $0x8080808080808080, %rbp
pext %rbp, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 8(%rdi)
mov $0x101010101010101, %r14
pext %r14, %r11, %r10
mov $0x2492490000, %r15
pdep %r15, %r10, %r10
xor %r10, 16(%rdi)
mov $0x202020202020202, %r9
pext %r9, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 24(%rdi)
mov $0x404040404040404, %r8
pext %r8, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 32(%rdi)
mov $0x808080808080808, %r12
pext %r12, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 40(%rdi)
mov $0x1010101010101010, %rax
pext %rax, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 48(%rdi)
mov $0x2020202020202020, %rdx
pext %rdx, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 56(%rdi)
mov 56(%rsi), %r11
mov $0x40404040404040, %rbx
pext %rbx, %r11, %r10
mov $0x2492480000000000, %r13
pdep %r13, %r10, %r10
xor %r10, 0(%rdi)
mov $0x80808080808080, %rbp
pext %rbp, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 8(%rdi)
pext %r14, %r11, %r10
mov $0x2492490000000000, %rcx
pdep %rcx, %r10, %r10
xor %r10, 16(%rdi)
pext %r9, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 24(%rdi)
pext %r8, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 32(%rdi)
pext %r12, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 40(%rdi)
pext %rax, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 48(%rdi)
mov $0x20202020202020, %rdx
pext %rdx, %r11, %r10
mov $0x492490000000000, %r15
pdep %r15, %r10, %r10
xor %r10, 56(%rdi)
pop %rbp
pop %rbx
pop %r12
pop %r13
pop %r14
pop %r15
ret
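
Each pext/pdep pair above gathers the source bits selected by one mask (typically one bit per byte: 0x0101..., 0x0202..., ...) and scatters them onto every third bit position of a destination word (masks such as 0x249249000000), storing the first contribution and XOR-accumulating the rest; the occasional rol re-aligns a group that crosses a 64-bit boundary. This realizes the bit permutation that repeated squaring induces on GF(2)[x]/(x^509 - 1), since squaring there sends coefficient i to coefficient 2i mod 509. A minimal C sketch of one accumulation step, assuming BMI2 intrinsics and hypothetical accumulator/input words (not part of the patch):

#include <immintrin.h>  /* BMI2 intrinsics: compile with -mbmi2, as in the AVX2 Makefile */
#include <stdint.h>

/* Sketch of one step of the scatter/gather idiom above: gather the bits of
   `in` selected by src_mask (e.g. 0x0202020202020202, one bit per byte) and
   scatter them to the positions selected by dst_mask (e.g. 0x249249000000,
   every third bit), XORing into the accumulator word. */
static inline void square_step(uint64_t *acc, uint64_t in,
                               uint64_t src_mask, uint64_t dst_mask) {
    uint64_t bits = _pext_u64(in, src_mask);
    *acc ^= _pdep_u64(bits, dst_mask);
}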

+ 784
- 0
crypto_kem/ntruhps2048509/avx2/vec32_sample_iid.s View File

@@ -0,0 +1,784 @@
.data
.p2align 5
cast8_to_16:
.byte 255
.byte 0
.byte 255
.byte 1
.byte 255
.byte 2
.byte 255
.byte 3
.byte 255
.byte 4
.byte 255
.byte 5
.byte 255
.byte 6
.byte 255
.byte 7
.byte 255
.byte 0
.byte 255
.byte 1
.byte 255
.byte 2
.byte 255
.byte 3
.byte 255
.byte 4
.byte 255
.byte 5
.byte 255
.byte 6
.byte 255
.byte 7
mask_ff:
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
mask_f:
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
mask_3:
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.text
.global PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid
.global _PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid
PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid:
_PQCLEAN_NTRUHPS2048509_AVX2_vec32_sample_iid:
vmovdqa 0(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 0(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 32(%rdi)
vmovdqa 32(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 64(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 96(%rdi)
vmovdqa 64(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 128(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 160(%rdi)
vmovdqa 96(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 192(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 224(%rdi)
vmovdqa 128(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 256(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 288(%rdi)
vmovdqa 160(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 320(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 352(%rdi)
vmovdqa 192(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 384(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 416(%rdi)
vmovdqa 224(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 448(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 480(%rdi)
vmovdqa 256(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 512(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 544(%rdi)
vmovdqa 288(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 576(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 608(%rdi)
vmovdqa 320(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 640(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 672(%rdi)
vmovdqa 352(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 704(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 736(%rdi)
vmovdqa 384(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 768(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 800(%rdi)
vmovdqa 416(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 832(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 864(%rdi)
vmovdqa 448(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 896(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 928(%rdi)
vmovdqa 480(%rsi), %ymm3
vextracti128 $0, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 960(%rdi)
vextracti128 $1, %ymm3, %xmm1
vpermq $216, %ymm1, %ymm1
vpshufb cast8_to_16(%rip), %ymm1, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 992(%rdi)
movw $0, 1016(%rdi)
movw $0, 1018(%rdi)
movw $0, 1020(%rdi)
movw $0, 1022(%rdi)
ret
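
The block that repeats for every 16 input bytes above reduces each byte to a value in {0, 1, 2} without division: the first fold (shift by 8 plus mask 0xff) just recovers the byte from the widened 16-bit lane, the next folds use 16 = 1 and 4 = 1 (mod 3), and the tail is a branchless subtract-3-if-needed done with a sign-mask select. A scalar C model of one lane (a sketch, not part of the patch) is:

#include <stdint.h>

/* Scalar model of the vectorized byte -> {0,1,2} reduction above.  Folding
   the high part onto the low part preserves the value mod 3 because
   256, 16 and 4 are all congruent to 1 mod 3. */
static uint16_t byte_mod3(uint8_t b) {
    uint16_t t = (uint16_t)((b & 0x0f) + (b >> 4));  /* <= 30, same value mod 3 */
    t = (t & 0x3) + (t >> 2);                        /* <= 10 */
    t = (t & 0x3) + (t >> 2);                        /* <=  5 */
    return (uint16_t)(t < 3 ? t : t - 3);            /* the assembly does this
                                                        step branchlessly with a
                                                        sign-mask select */
}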

+ 2
- 2
crypto_kem/ntruhps2048509/clean/Makefile View File

@@ -1,8 +1,8 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libntruhps2048509_clean.a
HEADERS=api.h crypto_sort.h owcpa.h params.h poly.h sample.h verify.h
OBJECTS=crypto_sort.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_mod.o poly_r2_inv.o poly_rq_mul.o poly_s3_inv.o sample.o sample_iid.o verify.o
HEADERS=api.h cmov.h crypto_sort_int32.h owcpa.h params.h poly.h sample.h
OBJECTS=cmov.o crypto_sort_int32.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_mod.o poly_r2_inv.o poly_rq_mul.o poly_s3_inv.o sample.o sample_iid.o

CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)



+ 1
- 1
crypto_kem/ntruhps2048509/clean/Makefile.Microsoft_nmake View File

@@ -2,7 +2,7 @@
# nmake /f Makefile.Microsoft_nmake

LIBRARY=libntruhps2048509_clean.lib
OBJECTS=crypto_sort.obj kem.obj owcpa.obj pack3.obj packq.obj poly.obj poly_lift.obj poly_mod.obj poly_r2_inv.obj poly_rq_mul.obj poly_s3_inv.obj sample.obj sample_iid.obj verify.obj
OBJECTS=cmov.obj crypto_sort_int32.obj kem.obj owcpa.obj pack3.obj packq.obj poly.obj poly_lift.obj poly_mod.obj poly_r2_inv.obj poly_rq_mul.obj poly_s3_inv.obj sample.obj sample_iid.obj

CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX



+ 1
- 1
crypto_kem/ntruhps2048509/clean/api.h View File

@@ -8,7 +8,7 @@
#define PQCLEAN_NTRUHPS2048509_CLEAN_CRYPTO_CIPHERTEXTBYTES 699
#define PQCLEAN_NTRUHPS2048509_CLEAN_CRYPTO_BYTES 32

#define PQCLEAN_NTRUHPS2048509_CLEAN_CRYPTO_ALGNAME "NTRU-HPS2048509"
#define PQCLEAN_NTRUHPS2048509_CLEAN_CRYPTO_ALGNAME "ntruhps2048509"

int PQCLEAN_NTRUHPS2048509_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk);



+ 11
- 0
crypto_kem/ntruhps2048509/clean/cmov.c View File

@@ -0,0 +1,11 @@
#include "cmov.h"

/* b = 1 means mov, b = 0 means don't mov */
void PQCLEAN_NTRUHPS2048509_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
size_t i;

b = (~b + 1);
for (i = 0; i < len; i++) {
r[i] ^= b & (x[i] ^ r[i]);
}
}
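
A minimal standalone check of the cmov semantics (illustration only, not part of the patch): two's-complement negation turns b = 1 into the byte mask 0xff and leaves b = 0 at 0x00, so the loop either copies x into r or leaves r untouched while performing the same memory accesses either way.

#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Same body as PQCLEAN_NTRUHPS2048509_CLEAN_cmov above, duplicated here so
   the check compiles on its own. */
static void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
    b = (unsigned char)(~b + 1);            /* 1 -> 0xff, 0 -> 0x00 */
    for (size_t i = 0; i < len; i++) {
        r[i] ^= b & (x[i] ^ r[i]);
    }
}

int main(void) {
    unsigned char r[4] = {1, 2, 3, 4};
    const unsigned char x[4] = {9, 9, 9, 9};
    const unsigned char orig[4] = {1, 2, 3, 4};
    cmov(r, x, 4, 0);
    assert(memcmp(r, orig, 4) == 0);   /* b = 0: r untouched */
    cmov(r, x, 4, 1);
    assert(memcmp(r, x, 4) == 0);      /* b = 1: x copied into r */
    return 0;
}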

+ 10
- 0
crypto_kem/ntruhps2048509/clean/cmov.h View File

@@ -0,0 +1,10 @@
#ifndef VERIFY_H
#define VERIFY_H

#include "params.h"

#include <stddef.h>

void PQCLEAN_NTRUHPS2048509_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b);

#endif

+ 0
- 50
crypto_kem/ntruhps2048509/clean/crypto_sort.c View File

@@ -1,50 +0,0 @@
// XXX: Temporary placeholder for a faster sort.
// Copied from supercop-20190110/crypto_sort/int32/portable3

#include <stdint.h>

#include "crypto_sort.h"

#define int32_MINMAX(a,b) \
do { \
int32_t ab = (b) ^ (a); \
int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \
c ^= ab & (c ^ (b)); \
c >>= 31; \
c &= ab; \
(a) ^= c; \
(b) ^= c; \
} while(0)

void PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort(void *array, long long n) {
long long top, p, q, r, i;
int32_t *x = array;

if (n < 2) {
return;
}
top = 1;
while (top < n - top) {
top += top;
}

for (p = top; p > 0; p >>= 1) {
for (i = 0; i < n - p; ++i) {
if (!(i & p)) {
int32_MINMAX(x[i], x[i + p]);
}
}
i = 0;
for (q = top; q > p; q >>= 1) {
for (; i < n - q; ++i) {
if (!(i & p)) {
int32_t a = x[i + p];
for (r = q; r > p; r >>= 1) {
int32_MINMAX(a, x[i + r]);
}
x[i + p] = a;
}
}
}
}
}

+ 0
- 6
crypto_kem/ntruhps2048509/clean/crypto_sort.h View File

@@ -1,6 +0,0 @@
#ifndef CRYPTO_SORT
#define CRYPTO_SORT

void PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort(void *array, long long n);

#endif

+ 86
- 0
crypto_kem/ntruhps2048509/clean/crypto_sort_int32.c View File

@@ -0,0 +1,86 @@
// Based on supercop-20190110/crypto_sort/int32/x86

#include "crypto_sort_int32.h"

#include <stdint.h>
#define int32 int32_t

#define int32_MINMAX(a,b) \
do { \
int32_t ab = (b) ^ (a); \
int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \
c ^= ab & (c ^ (b)); \
c >>= 31; \
c &= ab; \
(a) ^= c; \
(b) ^= c; \
} while(0)

/* assume 2 <= n <= 0x40000000 */
void PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort_int32(int32 *array, size_t n) {
size_t top, p, q, r, i, j;
int32 *x = array;

top = 1;
while (top < n - top) {
top += top;
}

for (p = top; p >= 1; p >>= 1) {
i = 0;
while (i + 2 * p <= n) {
for (j = i; j < i + p; ++j) {
int32_MINMAX(x[j], x[j + p]);
}
i += 2 * p;
}
for (j = i; j < n - p; ++j) {
int32_MINMAX(x[j], x[j + p]);
}

i = 0;
j = 0;
for (q = top; q > p; q >>= 1) {
if (j != i) {
for (;;) {
if (j == n - q) {
goto done;
}
int32 a = x[j + p];
for (r = q; r > p; r >>= 1) {
int32_MINMAX(a, x[j + r]);
}
x[j + p] = a;
++j;
if (j == i + p) {
i += 2 * p;
break;
}
}
}
while (i + p <= n - q) {
for (j = i; j < i + p; ++j) {
int32 a = x[j + p];
for (r = q; r > p; r >>= 1) {
int32_MINMAX(a, x[j + r]);
}
x[j + p] = a;
}
i += 2 * p;
}
/* now i + p > n - q */
j = i;
while (j < n - q) {
int32 a = x[j + p];
for (r = q; r > p; r >>= 1) {
int32_MINMAX(a, x[j + r]);
}
x[j + p] = a;
++j;
}

done:
;
}
}
}
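
The (int64_t) subtraction inside int32_MINMAX avoids signed overflow when b - a does not fit in 32 bits, and the merge-network structure touches the array in a pattern that depends only on n, which is why sample.c further down can rely on it for data-independent shuffling. A small usage sketch (not part of the patch; link against this file, prototype as in crypto_sort_int32.h):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

void PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort_int32(int32_t *array, size_t n);

int main(void) {
    int32_t v[8] = {5, -1, 3, 3, 0, INT32_MIN, 9, -7};
    PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort_int32(v, 8);   /* 2 <= n <= 0x40000000 */
    for (int i = 1; i < 8; i++) {
        assert(v[i - 1] <= v[i]);   /* non-decreasing */
    }
    return 0;
}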

+ 11
- 0
crypto_kem/ntruhps2048509/clean/crypto_sort_int32.h View File

@@ -0,0 +1,11 @@
#ifndef CRYPTO_SORT
#define CRYPTO_SORT

#include "params.h"

#include <stddef.h>
#include <stdint.h>

void PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort_int32(int32_t *array, size_t n);

#endif

+ 2
- 4
crypto_kem/ntruhps2048509/clean/kem.c View File

@@ -1,12 +1,10 @@
#include <stdint.h>

#include "api.h"
#include "cmov.h"
#include "fips202.h"
#include "owcpa.h"
#include "params.h"
#include "randombytes.h"
#include "sample.h"
#include "verify.h"

// API FUNCTIONS
int PQCLEAN_NTRUHPS2048509_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
@@ -51,7 +49,7 @@ int PQCLEAN_NTRUHPS2048509_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, co

fail |= PQCLEAN_NTRUHPS2048509_CLEAN_owcpa_dec(rm, c, sk);
/* If fail = 0 then c = Enc(h, rm). There is no need to re-encapsulate. */
/* See comment in PQCLEAN_NTRUHPS2048509_CLEAN_owcpa_dec for details. */
/* See comment in PQCLEAN_NTRUHPS2048509_CLEAN_owcpa_dec for details. */

sha3_256(k, rm, NTRU_OWCPA_MSGBYTES);



+ 1
- 0
crypto_kem/ntruhps2048509/clean/owcpa.c View File

@@ -59,6 +59,7 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_owcpa_keypair(unsigned char *pk,
PQCLEAN_NTRUHPS2048509_CLEAN_poly_Z3_to_Zq(f);
PQCLEAN_NTRUHPS2048509_CLEAN_poly_Z3_to_Zq(g);


/* g = 3*g */
for (i = 0; i < NTRU_N; i++) {
g->coeffs[i] = 3 * g->coeffs[i];


+ 0
- 1
crypto_kem/ntruhps2048509/clean/pack3.c View File

@@ -19,7 +19,6 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_poly_S3_tobytes(unsigned char msg[NTRU_OWCPA_M
c = (3 * c + a->coeffs[5 * i + j]) & 255;
}
msg[i] = c;

}

void PQCLEAN_NTRUHPS2048509_CLEAN_poly_S3_frombytes(poly *r, const unsigned char msg[NTRU_OWCPA_MSGBYTES]) {


+ 6
- 8
crypto_kem/ntruhps2048509/clean/packq.c View File

@@ -29,7 +29,6 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_poly_Sq_tobytes(unsigned char *r, const poly *
t[j] = 0;
}


switch (NTRU_PACK_DEG & 0x07) {
// cases 0 and 6 are impossible since 2 generates (Z/n)* and
// p mod 8 in {1, 7} implies that 2 is a quadratic residue.
@@ -61,19 +60,18 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_poly_Sq_frombytes(poly *r, const unsigned char
r->coeffs[8 * i + 6] = (a[11 * i + 8] >> 2) | (((uint16_t)a[11 * i + 9] & 0x1f) << 6);
r->coeffs[8 * i + 7] = (a[11 * i + 9] >> 5) | (((uint16_t)a[11 * i + 10] & 0xff) << 3);
}

switch (NTRU_PACK_DEG & 0x07) {
// cases 0 and 6 are impossible since 2 generates (Z/n)* and
// p mod 8 in {1, 7} implies that 2 is a quadratic residue.
case 4:
r->coeffs[8 * i + 0] = (unsigned char) (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8);
r->coeffs[8 * i + 1] = (unsigned char) (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5);
r->coeffs[8 * i + 2] = (unsigned char) (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10);
r->coeffs[8 * i + 3] = (unsigned char) (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7);
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8);
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5);
r->coeffs[8 * i + 2] = (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10);
r->coeffs[8 * i + 3] = (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7);
break;
case 2:
r->coeffs[8 * i + 0] = (unsigned char) (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8);
r->coeffs[8 * i + 1] = (unsigned char) (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5);
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8);
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5);
break;
}
r->coeffs[NTRU_N - 1] = 0;


+ 1
- 0
crypto_kem/ntruhps2048509/clean/params.h View File

@@ -5,6 +5,7 @@
#define NTRU_N 509
#define NTRU_LOGQ 11


/* Do not modify below this line */

#define PAD32(X) ((((X) + 31)/32)*32)


+ 2
- 6
crypto_kem/ntruhps2048509/clean/poly.h View File

@@ -1,16 +1,13 @@
#ifndef POLY_H
#define POLY_H

#include <stdint.h>

#include "params.h"

#include <stdint.h>

#define MODQ(X) ((X) & (NTRU_Q-1))

typedef struct {
// round to nearest multiple of 32 to make it easier to load into vector
// registers without having to do bound checks
#define NTRU_N_32 PAD32(NTRU_N)
uint16_t coeffs[NTRU_N];
} poly;

@@ -38,5 +35,4 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_poly_S3_inv(poly *r, const poly *a);

void PQCLEAN_NTRUHPS2048509_CLEAN_poly_Z3_to_Zq(poly *r);
void PQCLEAN_NTRUHPS2048509_CLEAN_poly_trinary_Zq_to_Z3(poly *r);

#endif

+ 1
- 0
crypto_kem/ntruhps2048509/clean/poly_lift.c View File

@@ -8,3 +8,4 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_poly_lift(poly *r, const poly *a) {
PQCLEAN_NTRUHPS2048509_CLEAN_poly_Z3_to_Zq(r);
}



+ 51
- 95
crypto_kem/ntruhps2048509/clean/poly_r2_inv.c View File

@@ -1,113 +1,69 @@
#include "poly.h"
#include "verify.h"

#define POLY_R2_ADD(I,A,B,S) \
for ((I)=0; (I)<NTRU_N; (I)++) { \
(A).coeffs[(I)] ^= (B).coeffs[(I)] * (S); \
}

static void cswappoly(poly *a, poly *b, int swap) {
int i;
uint16_t t;
swap = -swap;
for (i = 0; i < NTRU_N; i++) {
t = (a->coeffs[i] ^ b->coeffs[i]) & swap;
a->coeffs[i] ^= t;
b->coeffs[i] ^= t;
}
}

static inline void poly_divx(poly *a, int s) {
int i;
/* Based on supercop-20200702/crypto_core/invhrss701/simpler/core.c */

for (i = 1; i < NTRU_N; i++) {
a->coeffs[i - 1] = (unsigned char) ((s * a->coeffs[i]) | (!s * a->coeffs[i - 1]));
}
a->coeffs[NTRU_N - 1] = (!s * a->coeffs[NTRU_N - 1]);
}

static inline void poly_mulx(poly *a, int s) {
int i;
#include "poly.h"

for (i = 1; i < NTRU_N; i++) {
a->coeffs[NTRU_N - i] = (unsigned char) ((s * a->coeffs[NTRU_N - i - 1]) | (!s * a->coeffs[NTRU_N - i]));
}
a->coeffs[0] = (!s * a->coeffs[0]);
/* return -1 if x<0 and y<0; otherwise return 0 */
static inline int both_negative_mask(int x, int y) {
return (x & y) >> 15;
}

void PQCLEAN_NTRUHPS2048509_CLEAN_poly_R2_inv(poly *r, const poly *a) {
/* Schroeppel--Orman--O'Malley--Spatscheck
* "Almost Inverse" algorithm as described
* by Silverman in NTRU Tech Report #14 */
// with several modifications to make it run in constant-time
int i, j;
int k = 0;
uint16_t degf = NTRU_N - 1;
uint16_t degg = NTRU_N - 1;
int sign, t, swap;
int16_t done = 0;
poly b, f, g;
poly *c = r; // save some stack space
poly *temp_r = &f;
poly f, g, v, w;
int i, loop, delta;
int sign, swap, t;

/* b(X) := 1 */
for (i = 1; i < NTRU_N; i++) {
b.coeffs[i] = 0;
for (i = 0; i < NTRU_N; ++i) {
v.coeffs[i] = 0;
}
b.coeffs[0] = 1;

/* c(X) := 0 */
for (i = 0; i < NTRU_N; i++) {
c->coeffs[i] = 0;
for (i = 0; i < NTRU_N; ++i) {
w.coeffs[i] = 0;
}
w.coeffs[0] = 1;

/* f(X) := a(X) */
for (i = 0; i < NTRU_N; i++) {
f.coeffs[i] = a->coeffs[i] & 1;
for (i = 0; i < NTRU_N; ++i) {
f.coeffs[i] = 1;
}

/* g(X) := 1 + X + X^2 + ... + X^{N-1} */
for (i = 0; i < NTRU_N; i++) {
g.coeffs[i] = 1;
for (i = 0; i < NTRU_N - 1; ++i) {
g.coeffs[NTRU_N - 2 - i] = (a->coeffs[i] ^ a->coeffs[NTRU_N - 1]) & 1;
}
g.coeffs[NTRU_N - 1] = 0;

for (j = 0; j < 2 * (NTRU_N - 1) - 1; j++) {
sign = f.coeffs[0];
swap = sign & !done & ((degf - degg) >> 15);

cswappoly(&f, &g, swap);
cswappoly(&b, c, swap);
t = (degf ^ degg) & (-swap);
degf ^= t;
degg ^= t;

POLY_R2_ADD(i, f, g, sign * (!done));
POLY_R2_ADD(i, b, (*c), sign * (!done));

poly_divx(&f, !done);
poly_mulx(c, !done);
degf -= !done;
k += !done;

done = 1 - (((uint16_t) - degf) >> 15);
}
delta = 1;

k = k - NTRU_N * ((uint16_t)(NTRU_N - k - 1) >> 15);
for (loop = 0; loop < 2 * (NTRU_N - 1) - 1; ++loop) {
for (i = NTRU_N - 1; i > 0; --i) {
v.coeffs[i] = v.coeffs[i - 1];
}
v.coeffs[0] = 0;

sign = g.coeffs[0] & f.coeffs[0];
swap = both_negative_mask(-delta, -(int) g.coeffs[0]);
delta ^= swap & (delta ^ -delta);
delta += 1;

for (i = 0; i < NTRU_N; ++i) {
t = swap & (f.coeffs[i] ^ g.coeffs[i]);
f.coeffs[i] ^= t;
g.coeffs[i] ^= t;
t = swap & (v.coeffs[i] ^ w.coeffs[i]);
v.coeffs[i] ^= t;
w.coeffs[i] ^= t;
}

/* Return X^{N-k} * b(X) */
/* This is a k-coefficient rotation. We do this by looking at the binary
representation of k, rotating for every power of 2, and performing a cmov
if the respective bit is set. */
for (i = 0; i < NTRU_N; i++) {
r->coeffs[i] = b.coeffs[i];
for (i = 0; i < NTRU_N; ++i) {
g.coeffs[i] = g.coeffs[i] ^ (sign & f.coeffs[i]);
}
for (i = 0; i < NTRU_N; ++i) {
w.coeffs[i] = w.coeffs[i] ^ (sign & v.coeffs[i]);
}
for (i = 0; i < NTRU_N - 1; ++i) {
g.coeffs[i] = g.coeffs[i + 1];
}
g.coeffs[NTRU_N - 1] = 0;
}

for (i = 0; i < 10; i++) {
for (j = 0; j < NTRU_N; j++) {
temp_r->coeffs[j] = r->coeffs[(j + (1 << i)) % NTRU_N];
}
PQCLEAN_NTRUHPS2048509_CLEAN_cmov((unsigned char *) & (r->coeffs),
(unsigned char *) & (temp_r->coeffs), sizeof(uint16_t) * NTRU_N, k & 1);
k >>= 1;
for (i = 0; i < NTRU_N - 1; ++i) {
r->coeffs[i] = v.coeffs[NTRU_N - 2 - i];
}
r->coeffs[NTRU_N - 1] = 0;
}

+ 53
- 112
crypto_kem/ntruhps2048509/clean/poly_s3_inv.c View File

@@ -1,137 +1,78 @@
/* Based on supercop-20200702/crypto_core/invhrss701/simpler/core.c */

#include "poly.h"
#include "verify.h"

static uint16_t mod3(uint16_t a) {
uint16_t r;
static inline uint8_t mod3(uint8_t a) { /* a between 0 and 9 */
int16_t t, c;

r = (a >> 8) + (a & 0xff); // r mod 255 == a mod 255
r = (r >> 4) + (r & 0xf); // r' mod 15 == r mod 15
r = (r >> 2) + (r & 0x3); // r' mod 3 == r mod 3
r = (r >> 2) + (r & 0x3); // r' mod 3 == r mod 3

t = r - 3;
c = t >> 15;

return (c & r) ^ (~c & t);
a = (a >> 2) + (a & 3); /* between 0 and 4 */
t = a - 3;
c = t >> 5;
return t ^ (c & (a ^ t));
}

#define POLY_S3_FMADD(I,A,B,S) \
for ((I)=0; (I)<NTRU_N; (I)++) { \
(A).coeffs[(I)] = mod3((A).coeffs[(I)] + (S) * (B).coeffs[(I)]); \
}

static void cswappoly(poly *a, poly *b, int swap) {
int i;
uint16_t t;
swap = -swap;
for (i = 0; i < NTRU_N; i++) {
t = (a->coeffs[i] ^ b->coeffs[i]) & swap;
a->coeffs[i] ^= t;
b->coeffs[i] ^= t;
}
/* return -1 if x<0 and y<0; otherwise return 0 */
static inline int both_negative_mask(int x, int y) {
return (x & y) >> 15;
}

static inline void poly_divx(poly *a, int s) {
int i;
void PQCLEAN_NTRUHPS2048509_CLEAN_poly_S3_inv(poly *r, const poly *a) {
poly f, g, v, w;
int i, loop, delta;
int sign, swap, t;

for (i = 1; i < NTRU_N; i++) {
a->coeffs[i - 1] = (unsigned char) ((s * a->coeffs[i]) | (!s * a->coeffs[i - 1]));
for (i = 0; i < NTRU_N; ++i) {
v.coeffs[i] = 0;
}
a->coeffs[NTRU_N - 1] = (!s * a->coeffs[NTRU_N - 1]);
}

static inline void poly_mulx(poly *a, int s) {
int i;

for (i = 1; i < NTRU_N; i++) {
a->coeffs[NTRU_N - i] = (unsigned char) ((s * a->coeffs[NTRU_N - i - 1]) | (!s * a->coeffs[NTRU_N - i]));
for (i = 0; i < NTRU_N; ++i) {
w.coeffs[i] = 0;
}
a->coeffs[0] = (!s * a->coeffs[0]);
}

void PQCLEAN_NTRUHPS2048509_CLEAN_poly_S3_inv(poly *r, const poly *a) {
/* Schroeppel--Orman--O'Malley--Spatscheck
* "Almost Inverse" algorithm as described
* by Silverman in NTRU Tech Report #14 */
// with several modifications to make it run in constant-time
int i, j;
uint16_t k = 0;
uint16_t degf = NTRU_N - 1;
uint16_t degg = NTRU_N - 1;
int sign, fsign = 0, t, swap;
int16_t done = 0;
poly b, c, f, g;
poly *temp_r = &f;
w.coeffs[0] = 1;

/* b(X) := 1 */
for (i = 1; i < NTRU_N; i++) {
b.coeffs[i] = 0;
for (i = 0; i < NTRU_N; ++i) {
f.coeffs[i] = 1;
}
b.coeffs[0] = 1;

/* c(X) := 0 */
for (i = 0; i < NTRU_N; i++) {
c.coeffs[i] = 0;
for (i = 0; i < NTRU_N - 1; ++i) {
g.coeffs[NTRU_N - 2 - i] = mod3((a->coeffs[i] & 3) + 2 * (a->coeffs[NTRU_N - 1] & 3));
}
g.coeffs[NTRU_N - 1] = 0;

/* f(X) := a(X) */
for (i = 0; i < NTRU_N; i++) {
f.coeffs[i] = a->coeffs[i];
}
delta = 1;

/* g(X) := 1 + X + X^2 + ... + X^{N-1} */
for (i = 0; i < NTRU_N; i++) {
g.coeffs[i] = 1;
}
for (loop = 0; loop < 2 * (NTRU_N - 1) - 1; ++loop) {
for (i = NTRU_N - 1; i > 0; --i) {
v.coeffs[i] = v.coeffs[i - 1];
}
v.coeffs[0] = 0;

for (j = 0; j < 2 * (NTRU_N - 1) - 1; j++) {
sign = mod3(2 * g.coeffs[0] * f.coeffs[0]);
swap = (((sign & 2) >> 1) | sign) & !done & ((degf - degg) >> 15);

cswappoly(&f, &g, swap);
cswappoly(&b, &c, swap);
t = (degf ^ degg) & (-swap);
degf ^= t;
degg ^= t;
swap = both_negative_mask(-delta, -(int) g.coeffs[0]);
delta ^= swap & (delta ^ -delta);
delta += 1;

for (i = 0; i < NTRU_N; ++i) {
t = swap & (f.coeffs[i] ^ g.coeffs[i]);
f.coeffs[i] ^= t;
g.coeffs[i] ^= t;
t = swap & (v.coeffs[i] ^ w.coeffs[i]);
v.coeffs[i] ^= t;
w.coeffs[i] ^= t;
}

for (i = 0; i < NTRU_N; i++) {
f.coeffs[i] = mod3(f.coeffs[i] + ((uint16_t) (sign * (!done))) * g.coeffs[i]);
for (i = 0; i < NTRU_N; ++i) {
g.coeffs[i] = mod3(g.coeffs[i] + sign * f.coeffs[i]);
}
for (i = 0; i < NTRU_N; i++) {
b.coeffs[i] = mod3(b.coeffs[i] + ((uint16_t) (sign * (!done))) * c.coeffs[i]);
for (i = 0; i < NTRU_N; ++i) {
w.coeffs[i] = mod3(w.coeffs[i] + sign * v.coeffs[i]);
}

poly_divx(&f, !done);
poly_mulx(&c, !done);
degf -= !done;
k += !done;

done = 1 - (((uint16_t) - degf) >> 15);
}

fsign = f.coeffs[0];
k = k - NTRU_N * ((uint16_t)(NTRU_N - k - 1) >> 15);

/* Return X^{N-k} * b(X) */
/* This is a k-coefficient rotation. We do this by looking at the binary
representation of k, rotating for every power of 2, and performing a cmov
if the respective bit is set. */
for (i = 0; i < NTRU_N; i++) {
r->coeffs[i] = mod3((uint16_t) fsign * b.coeffs[i]);
}

for (i = 0; i < 10; i++) {
for (j = 0; j < NTRU_N; j++) {
temp_r->coeffs[j] = r->coeffs[(j + (1 << i)) % NTRU_N];
for (i = 0; i < NTRU_N - 1; ++i) {
g.coeffs[i] = g.coeffs[i + 1];
}
PQCLEAN_NTRUHPS2048509_CLEAN_cmov((unsigned char *) & (r->coeffs),
(unsigned char *) & (temp_r->coeffs), sizeof(uint16_t) * NTRU_N, k & 1);
k >>= 1;
g.coeffs[NTRU_N - 1] = 0;
}

/* Reduce modulo Phi_n */
for (i = 0; i < NTRU_N; i++) {
r->coeffs[i] = mod3(r->coeffs[i] + 2 * r->coeffs[NTRU_N - 1]);
sign = f.coeffs[0];
for (i = 0; i < NTRU_N - 1; ++i) {
r->coeffs[i] = mod3(sign * v.coeffs[NTRU_N - 2 - i]);
}
r->coeffs[NTRU_N - 1] = 0;
}
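
A quick standalone check (not part of the patch) that the new mod3 above is exact on its documented input range 0..9: the fold uses 4 = 1 (mod 3), and the tail is a branchless subtract-3 selected by the sign of t.

#include <assert.h>
#include <stdint.h>

/* Same body as the mod3 above, duplicated so the check compiles on its own. */
static uint8_t mod3(uint8_t a) {
    int16_t t, c;
    a = (uint8_t)((a >> 2) + (a & 3));   /* now 0..4, unchanged mod 3 */
    t = (int16_t)(a - 3);
    c = (int16_t)(t >> 5);               /* -1 if t < 0, else 0 */
    return (uint8_t)(t ^ (c & (a ^ t))); /* a if a < 3, else a - 3 */
}

int main(void) {
    for (uint8_t a = 0; a <= 9; a++) {
        assert(mod3(a) == a % 3);
    }
    return 0;
}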

+ 9
- 6
crypto_kem/ntruhps2048509/clean/sample.c View File

@@ -1,27 +1,30 @@
#include "crypto_sort.h"
#include "sample.h"

void PQCLEAN_NTRUHPS2048509_CLEAN_sample_fg(poly *f, poly *g, const unsigned char uniformbytes[NTRU_SAMPLE_FG_BYTES]) {

PQCLEAN_NTRUHPS2048509_CLEAN_sample_iid(f, uniformbytes);
PQCLEAN_NTRUHPS2048509_CLEAN_sample_fixed_type(g, uniformbytes + NTRU_SAMPLE_IID_BYTES);
}

void PQCLEAN_NTRUHPS2048509_CLEAN_sample_rm(poly *r, poly *m, const unsigned char uniformbytes[NTRU_SAMPLE_RM_BYTES]) {

PQCLEAN_NTRUHPS2048509_CLEAN_sample_iid(r, uniformbytes);
PQCLEAN_NTRUHPS2048509_CLEAN_sample_fixed_type(m, uniformbytes + NTRU_SAMPLE_IID_BYTES);
}


#include "crypto_sort_int32.h"
void PQCLEAN_NTRUHPS2048509_CLEAN_sample_fixed_type(poly *r, const unsigned char u[NTRU_SAMPLE_FT_BYTES]) {
// Assumes NTRU_SAMPLE_FT_BYTES = ceil(30*(n-1)/8)

uint32_t s[NTRU_N - 1];
int32_t s[NTRU_N - 1];
int i;

// Use 30 bits of u per word
for (i = 0; i < (NTRU_N - 1) / 4; i++) {
s[4 * i + 0] = (u[15 * i + 0] << 2) + (u[15 * i + 1] << 10) + (u[15 * i + 2] << 18) + ((uint32_t) u[15 * i + 3] << 26);
s[4 * i + 1] = ((u[15 * i + 3] & 0xc0) >> 4) + (u[15 * i + 4] << 4) + (u[15 * i + 5] << 12) + (u[15 * i + 6] << 20) + ((uint32_t) u[15 * i + 7] << 28);
s[4 * i + 2] = ((u[15 * i + 7] & 0xf0) >> 2) + (u[15 * i + 8] << 6) + (u[15 * i + 9] << 14) + (u[15 * i + 10] << 22) + ((uint32_t) u[15 * i + 11] << 30);
s[4 * i + 0] = (u[15 * i + 0] << 2) + (u[15 * i + 1] << 10) + (u[15 * i + 2] << 18) + ((uint32_t) u[15 * i + 3] << 26);
s[4 * i + 1] = ((u[15 * i + 3] & 0xc0) >> 4) + (u[15 * i + 4] << 4) + (u[15 * i + 5] << 12) + (u[15 * i + 6] << 20) + ((uint32_t) u[15 * i + 7] << 28);
s[4 * i + 2] = ((u[15 * i + 7] & 0xf0) >> 2) + (u[15 * i + 8] << 6) + (u[15 * i + 9] << 14) + (u[15 * i + 10] << 22) + ((uint32_t) u[15 * i + 11] << 30);
s[4 * i + 3] = (u[15 * i + 11] & 0xfc) + (u[15 * i + 12] << 8) + (u[15 * i + 13] << 16) + ((uint32_t) u[15 * i + 14] << 24);
}

@@ -33,7 +36,7 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_sample_fixed_type(poly *r, const unsigned char
s[i] |= 2;
}

PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort(s, NTRU_N - 1);
PQCLEAN_NTRUHPS2048509_CLEAN_crypto_sort_int32(s, NTRU_N - 1);

for (i = 0; i < NTRU_N - 1; i++) {
r->coeffs[i] = ((uint16_t) (s[i] & 3));


+ 1
- 2
crypto_kem/ntruhps2048509/clean/sample.h View File

@@ -1,8 +1,6 @@
#ifndef SAMPLE_H
#define SAMPLE_H

#include <stdlib.h>

#include "params.h"
#include "poly.h"

@@ -13,4 +11,5 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_sample_iid(poly *r, const unsigned char unifor

void PQCLEAN_NTRUHPS2048509_CLEAN_sample_fixed_type(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_FT_BYTES]);


#endif

+ 0
- 29
crypto_kem/ntruhps2048509/clean/verify.c View File

@@ -1,29 +0,0 @@
#include <stdint.h>
#include <stdlib.h>

#include "verify.h"

/* returns 0 for equal strings, 1 for non-equal strings */
unsigned char PQCLEAN_NTRUHPS2048509_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len) {
uint64_t r;
size_t i;

r = 0;
for (i = 0; i < len; i++) {
r |= a[i] ^ b[i];
}

r = (~r + 1); // Two's complement
r >>= 63;
return (unsigned char)r;
}

/* b = 1 means mov, b = 0 means don't mov*/
void PQCLEAN_NTRUHPS2048509_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
size_t i;

b = (~b + 1); // Two's complement
for (i = 0; i < len; i++) {
r[i] ^= b & (x[i] ^ r[i]);
}
}

+ 0
- 12
crypto_kem/ntruhps2048509/clean/verify.h View File

@@ -1,12 +0,0 @@
#ifndef VERIFY_H
#define VERIFY_H

#include <stdio.h>

/* returns 0 for equal strings, 1 for non-equal strings */
unsigned char PQCLEAN_NTRUHPS2048509_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len);

/* b = 1 means mov, b = 0 means don't mov*/
void PQCLEAN_NTRUHPS2048509_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b);

#endif

+ 15
- 2
crypto_kem/ntruhps2048677/META.yml View File

@@ -1,4 +1,4 @@
name: NTRU-HPS2048677
name: ntruhps2048677
type: kem
claimed-nist-level: 3
claimed-security: IND-CCA2
@@ -15,9 +15,22 @@ auxiliary-submitters:
- Jeffrey Hoffstein
- Andreas Hülsing
- Joost Rijneveld
- Tsunekazu Saito
- Peter Schwabe
- William Whyte
- Keita Xagawa
- Takashi Yamakawa
- Zhenfei Zhang
implementations:
- name: clean
version: https://github.com/jschanck/ntru/tree/485dde03 reference implementation
version: https://github.com/jschanck/ntru/tree/4699d70a reference implementation
- name: avx2
version: https://github.com/jschanck/ntru/tree/4699d70a avx2 implementation
supported_platforms:
- architecture: x86_64
operating_systems:
- Linux
- Darwin
required_flags:
- avx2
- bmi2

+ 1
- 0
crypto_kem/ntruhps2048677/avx2/LICENSE View File

@@ -0,0 +1 @@
Public Domain

+ 24
- 0
crypto_kem/ntruhps2048677/avx2/Makefile View File

@@ -0,0 +1,24 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libntruhps2048677_avx2.a
HEADERS=api.h cmov.h crypto_sort_int32.h owcpa.h params.h poly.h poly_r2_inv.h sample.h
OBJECTS=cmov.o crypto_sort_int32.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_r2_inv.o poly_s3_inv.o sample.o sample_iid.o \
square_1_677_patience.o square_2_677_patience.o square_3_677_patience.o square_5_677_patience.o square_10_677_shufbytes.o square_21_677_shufbytes.o square_42_677_shufbytes.o square_84_677_shufbytes.o square_168_677_shufbytes.o square_336_677_shufbytes.o \
poly_mod_3_Phi_n.o poly_mod_q_Phi_n.o poly_r2_mul.o poly_rq_mul.o poly_rq_to_s3.o vec32_sample_iid.o

CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)

all: $(LIB)

%.o: %.s $(HEADERS)
$(AS) -o $@ $<

%.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS)
$(AR) -r $@ $(OBJECTS)

clean:
$(RM) $(OBJECTS)
$(RM) $(LIB)

+ 19
- 0
crypto_kem/ntruhps2048677/avx2/api.h View File

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_NTRUHPS2048677_AVX2_API_H
#define PQCLEAN_NTRUHPS2048677_AVX2_API_H

#include <stdint.h>

#define PQCLEAN_NTRUHPS2048677_AVX2_CRYPTO_SECRETKEYBYTES 1234
#define PQCLEAN_NTRUHPS2048677_AVX2_CRYPTO_PUBLICKEYBYTES 930
#define PQCLEAN_NTRUHPS2048677_AVX2_CRYPTO_CIPHERTEXTBYTES 930
#define PQCLEAN_NTRUHPS2048677_AVX2_CRYPTO_BYTES 32

#define PQCLEAN_NTRUHPS2048677_AVX2_CRYPTO_ALGNAME "ntruhps2048677"

int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk);

int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk);

int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk);

#endif

+ 11
- 0
crypto_kem/ntruhps2048677/avx2/cmov.c View File

@@ -0,0 +1,11 @@
#include "cmov.h"

/* b = 1 means mov, b = 0 means don't mov */
void PQCLEAN_NTRUHPS2048677_AVX2_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
size_t i;

b = (~b + 1);
for (i = 0; i < len; i++) {
r[i] ^= b & (x[i] ^ r[i]);
}
}

+ 10
- 0
crypto_kem/ntruhps2048677/avx2/cmov.h View File

@@ -0,0 +1,10 @@
#ifndef VERIFY_H
#define VERIFY_H

#include "params.h"

#include <stddef.h>

void PQCLEAN_NTRUHPS2048677_AVX2_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b);

#endif

+ 1215
- 0
crypto_kem/ntruhps2048677/avx2/crypto_sort_int32.c
File diff suppressed because it is too large
View File


+ 11
- 0
crypto_kem/ntruhps2048677/avx2/crypto_sort_int32.h View File

@@ -0,0 +1,11 @@
#ifndef CRYPTO_SORT
#define CRYPTO_SORT

#include "params.h"

#include <stddef.h>
#include <stdint.h>

void PQCLEAN_NTRUHPS2048677_AVX2_crypto_sort_int32(int32_t *x, size_t n);

#endif

+ 68
- 0
crypto_kem/ntruhps2048677/avx2/kem.c View File

@@ -0,0 +1,68 @@
#include "api.h"
#include "cmov.h"
#include "fips202.h"
#include "owcpa.h"
#include "params.h"
#include "randombytes.h"
#include "sample.h"

// API FUNCTIONS
int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
uint8_t seed[NTRU_SAMPLE_FG_BYTES];

randombytes(seed, NTRU_SAMPLE_FG_BYTES);
PQCLEAN_NTRUHPS2048677_AVX2_owcpa_keypair(pk, sk, seed);

randombytes(sk + NTRU_OWCPA_SECRETKEYBYTES, NTRU_PRFKEYBYTES);

return 0;
}

int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) {
poly r, m;
uint8_t rm[NTRU_OWCPA_MSGBYTES];
uint8_t rm_seed[NTRU_SAMPLE_RM_BYTES];

randombytes(rm_seed, NTRU_SAMPLE_RM_BYTES);

PQCLEAN_NTRUHPS2048677_AVX2_sample_rm(&r, &m, rm_seed);

PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(rm, &r);
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(rm + NTRU_PACK_TRINARY_BYTES, &m);
sha3_256(k, rm, NTRU_OWCPA_MSGBYTES);

PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(&r);
PQCLEAN_NTRUHPS2048677_AVX2_owcpa_enc(c, &r, &m, pk);

return 0;
}

int PQCLEAN_NTRUHPS2048677_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
int i, fail;
uint8_t rm[NTRU_OWCPA_MSGBYTES];
uint8_t buf[NTRU_PRFKEYBYTES + NTRU_CIPHERTEXTBYTES];

fail = 0;

/* Check that unused bits of last byte of ciphertext are zero */
fail |= c[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG))));

fail |= PQCLEAN_NTRUHPS2048677_AVX2_owcpa_dec(rm, c, sk);
/* If fail = 0 then c = Enc(h, rm). There is no need to re-encapsulate. */
/* See comment in PQCLEAN_NTRUHPS2048677_AVX2_owcpa_dec for details. */

sha3_256(k, rm, NTRU_OWCPA_MSGBYTES);

/* shake(secret PRF key || input ciphertext) */
for (i = 0; i < NTRU_PRFKEYBYTES; i++) {
buf[i] = sk[i + NTRU_OWCPA_SECRETKEYBYTES];
}
for (i = 0; i < NTRU_CIPHERTEXTBYTES; i++) {
buf[NTRU_PRFKEYBYTES + i] = c[i];
}
sha3_256(rm, buf, NTRU_PRFKEYBYTES + NTRU_CIPHERTEXTBYTES);

PQCLEAN_NTRUHPS2048677_AVX2_cmov(k, rm, NTRU_SHAREDKEYBYTES, (unsigned char) fail);

return 0;
}

+ 160
- 0
crypto_kem/ntruhps2048677/avx2/owcpa.c View File

@@ -0,0 +1,160 @@
#include "owcpa.h"
#include "poly.h"
#include "sample.h"

static int owcpa_check_r(const poly *r) {
/* Check that r is in message space. */
/* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */
int i;
uint64_t t = 0;
uint16_t c;
for (i = 0; i < NTRU_N; i++) {
c = MODQ(r->coeffs[i] + 1);
t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */
t |= (c + 1) & 0x4; /* 0 if c is in {0,1,2} */
}
t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */
t = (~t + 1); // two's complement
t >>= 63;
return (int) t;
}

static int owcpa_check_m(const poly *m) {
/* Check that m is in message space. */
/* Note: Assumes that m has coefficients in {0,1,2}. */
int i;
uint64_t t = 0;
uint16_t p1 = 0;
uint16_t m1 = 0;
for (i = 0; i < NTRU_N; i++) {
p1 += m->coeffs[i] & 0x01;
m1 += (m->coeffs[i] & 0x02) >> 1;
}
/* Need p1 = m1 and p1 + m1 = NTRU_WEIGHT */
t |= p1 ^ m1;
t |= (p1 + m1) ^ NTRU_WEIGHT;
t = (~t + 1); // two's complement
t >>= 63;
return (int) t;
}

void PQCLEAN_NTRUHPS2048677_AVX2_owcpa_keypair(unsigned char *pk,
unsigned char *sk,
const unsigned char seed[NTRU_SAMPLE_FG_BYTES]) {
int i;

poly x1, x2, x3, x4, x5;

poly *f = &x1, *g = &x2, *invf_mod3 = &x3;
poly *gf = &x3, *invgf = &x4, *tmp = &x5;
poly *invh = &x3, *h = &x3;

PQCLEAN_NTRUHPS2048677_AVX2_sample_fg(f, g, seed);

PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_inv(invf_mod3, f);
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(sk, f);
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(sk + NTRU_PACK_TRINARY_BYTES, invf_mod3);

/* Lift coeffs of f and g from Z_p to Z_q */
PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(f);
PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(g);


/* g = 3*g */
for (i = 0; i < NTRU_N; i++) {
g->coeffs[i] = 3 * g->coeffs[i];
}

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(gf, g, f);

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_inv(invgf, gf);

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(tmp, invgf, f);
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_mul(invh, tmp, f);
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_tobytes(sk + 2 * NTRU_PACK_TRINARY_BYTES, invh);

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(tmp, invgf, g);
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(h, tmp, g);
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_tobytes(pk, h);
}


void PQCLEAN_NTRUHPS2048677_AVX2_owcpa_enc(unsigned char *c,
const poly *r,
const poly *m,
const unsigned char *pk) {
int i;
poly x1, x2;
poly *h = &x1, *liftm = &x1;
poly *ct = &x2;

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_frombytes(h, pk);

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(ct, r, h);

PQCLEAN_NTRUHPS2048677_AVX2_poly_lift(liftm, m);
for (i = 0; i < NTRU_N; i++) {
ct->coeffs[i] = ct->coeffs[i] + liftm->coeffs[i];
}

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_tobytes(c, ct);
}

int PQCLEAN_NTRUHPS2048677_AVX2_owcpa_dec(unsigned char *rm,
const unsigned char *ciphertext,
const unsigned char *secretkey) {
int i;
int fail;
poly x1, x2, x3, x4;

poly *c = &x1, *f = &x2, *cf = &x3;
poly *mf = &x2, *finv3 = &x3, *m = &x4;
poly *liftm = &x2, *invh = &x3, *r = &x4;
poly *b = &x1;

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_frombytes(c, ciphertext);
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_frombytes(f, secretkey);
PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(f);

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(cf, c, f);
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_to_S3(mf, cf);

PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_frombytes(finv3, secretkey + NTRU_PACK_TRINARY_BYTES);
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_mul(m, mf, finv3);
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(rm + NTRU_PACK_TRINARY_BYTES, m);

/* NOTE: For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)). */
/* We can avoid re-computing r*h + Lift(m) as long as we check that */
/* r (defined as b/h mod (q, Phi_n)) and m are in the message space. */
/* (m can take any value in S3 in NTRU_HRSS) */
fail = 0;
fail |= owcpa_check_m(m);

/* b = c - Lift(m) mod (q, x^n - 1) */
PQCLEAN_NTRUHPS2048677_AVX2_poly_lift(liftm, m);
for (i = 0; i < NTRU_N; i++) {
b->coeffs[i] = c->coeffs[i] - liftm->coeffs[i];
}

/* r = b / h mod (q, Phi_n) */
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_frombytes(invh, secretkey + 2 * NTRU_PACK_TRINARY_BYTES);
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_mul(r, b, invh);

/* NOTE: Our definition of r as b/h mod (q, Phi_n) follows Figure 4 of */
/* [Sch18] https://eprint.iacr.org/2018/1174/20181203:032458. */
/* This differs from Figure 10 of Saito--Xagawa--Yamakawa */
/* [SXY17] https://eprint.iacr.org/2017/1005/20180516:055500 */
/* where r gets a final reduction modulo p. */
/* We need this change to use Proposition 1 of [Sch18]. */

/* Proposition 1 of [Sch18] shows that re-encryption with (r,m) yields c. */
/* if and only if fail==0 after the following call to owcpa_check_r */
/* The procedure given in Fig. 8 of [Sch18] can be skipped because we have */
/* c(1) = 0 due to the use of poly_Rq_sum_zero_{to,from}bytes. */
fail |= owcpa_check_r(r);

PQCLEAN_NTRUHPS2048677_AVX2_poly_trinary_Zq_to_Z3(r);
PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(rm, r);

return fail;
}
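
The per-coefficient bit tests in owcpa_check_r above accept c = r_i + 1 mod q exactly when r_i represents -1, 0, or 1; with q = 2048 this can be verified exhaustively. A standalone check (not part of the patch, and not covering the separate last-coefficient test):

#include <assert.h>
#include <stdint.h>

int main(void) {
    for (uint16_t v = 0; v < 2048; v++) {
        uint16_t c = (uint16_t)((v + 1) & (2048 - 1));
        int accepted = ((c & (2048 - 4)) == 0)    /* c in {0,1,2,3} */
                    && (((c + 1) & 0x4) == 0);    /* c in {0,1,2}   */
        assert(accepted == (v == 0 || v == 1 || v == 2047));
    }
    return 0;
}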

+ 22
- 0
crypto_kem/ntruhps2048677/avx2/owcpa.h View File

@@ -0,0 +1,22 @@
#ifndef OWCPA_H
#define OWCPA_H

#include "params.h"
#include "poly.h"

void PQCLEAN_NTRUHPS2048677_AVX2_owcpa_samplemsg(unsigned char msg[NTRU_OWCPA_MSGBYTES],
const unsigned char seed[NTRU_SEEDBYTES]);

void PQCLEAN_NTRUHPS2048677_AVX2_owcpa_keypair(unsigned char *pk,
unsigned char *sk,
const unsigned char seed[NTRU_SEEDBYTES]);

void PQCLEAN_NTRUHPS2048677_AVX2_owcpa_enc(unsigned char *c,
const poly *r,
const poly *m,
const unsigned char *pk);

int PQCLEAN_NTRUHPS2048677_AVX2_owcpa_dec(unsigned char *rm,
const unsigned char *ciphertext,
const unsigned char *secretkey);
#endif

+ 46
- 0
crypto_kem/ntruhps2048677/avx2/pack3.c View File

@@ -0,0 +1,46 @@
#include "poly.h"

void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(unsigned char msg[NTRU_OWCPA_MSGBYTES], const poly *a) {
int i;
unsigned char c;
int j;

for (i = 0; i < NTRU_PACK_DEG / 5; i++) {
c = a->coeffs[5 * i + 4] & 255;
c = (3 * c + a->coeffs[5 * i + 3]) & 255;
c = (3 * c + a->coeffs[5 * i + 2]) & 255;
c = (3 * c + a->coeffs[5 * i + 1]) & 255;
c = (3 * c + a->coeffs[5 * i + 0]) & 255;
msg[i] = c;
}
i = NTRU_PACK_DEG / 5;
c = 0;
for (j = NTRU_PACK_DEG - (5 * i) - 1; j >= 0; j--) {
c = (3 * c + a->coeffs[5 * i + j]) & 255;
}
msg[i] = c;
}

void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_frombytes(poly *r, const unsigned char msg[NTRU_OWCPA_MSGBYTES]) {
int i;
unsigned char c;
int j;

for (i = 0; i < NTRU_PACK_DEG / 5; i++) {
c = msg[i];
r->coeffs[5 * i + 0] = c;
r->coeffs[5 * i + 1] = c * 171 >> 9; // this is division by 3
r->coeffs[5 * i + 2] = c * 57 >> 9; // division by 3^2
r->coeffs[5 * i + 3] = c * 19 >> 9; // division by 3^3
r->coeffs[5 * i + 4] = c * 203 >> 14; // etc.
}
i = NTRU_PACK_DEG / 5;
c = msg[i];
for (j = 0; (5 * i + j) < NTRU_PACK_DEG; j++) {
r->coeffs[5 * i + j] = c;
c = c * 171 >> 9;
}
r->coeffs[NTRU_N - 1] = 0;
PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n(r);
}
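
The multiply-shift constants in poly_S3_frombytes above implement exact truncated division by powers of 3, as the inline comments claim, for every value a packed byte can take (poly_S3_tobytes emits base-3 digits, so c <= 3^5 - 1 = 242). A standalone check, illustrative only and not part of the commit:

#include <assert.h>

static void check_div_by_3_constants(void) {
    unsigned int c;
    for (c = 0; c <= 242; c++) {        /* 242 = 3^5 - 1, the largest packed value */
        assert((c * 171 >> 9)  == c / 3);
        assert((c * 57  >> 9)  == c / 9);
        assert((c * 19  >> 9)  == c / 27);
        assert((c * 203 >> 14) == c / 81);
    }
}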


+ 93
- 0
crypto_kem/ntruhps2048677/avx2/packq.c View File

@@ -0,0 +1,93 @@
#include "poly.h"

void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_tobytes(unsigned char *r, const poly *a) {
int i, j;
uint16_t t[8];

for (i = 0; i < NTRU_PACK_DEG / 8; i++) {
for (j = 0; j < 8; j++) {
t[j] = MODQ(a->coeffs[8 * i + j]);
}

r[11 * i + 0] = (unsigned char) ( t[0] & 0xff);
r[11 * i + 1] = (unsigned char) ((t[0] >> 8) | ((t[1] & 0x1f) << 3));
r[11 * i + 2] = (unsigned char) ((t[1] >> 5) | ((t[2] & 0x03) << 6));
r[11 * i + 3] = (unsigned char) ((t[2] >> 2) & 0xff);
r[11 * i + 4] = (unsigned char) ((t[2] >> 10) | ((t[3] & 0x7f) << 1));
r[11 * i + 5] = (unsigned char) ((t[3] >> 7) | ((t[4] & 0x0f) << 4));
r[11 * i + 6] = (unsigned char) ((t[4] >> 4) | ((t[5] & 0x01) << 7));
r[11 * i + 7] = (unsigned char) ((t[5] >> 1) & 0xff);
r[11 * i + 8] = (unsigned char) ((t[5] >> 9) | ((t[6] & 0x3f) << 2));
r[11 * i + 9] = (unsigned char) ((t[6] >> 6) | ((t[7] & 0x07) << 5));
r[11 * i + 10] = (unsigned char) ((t[7] >> 3));
}

for (j = 0; j < NTRU_PACK_DEG - 8 * i; j++) {
t[j] = MODQ(a->coeffs[8 * i + j]);
}
for (; j < 8; j++) {
t[j] = 0;
}

switch (NTRU_PACK_DEG & 0x07) {
// cases 0 and 6 are impossible: 2 generates (Z/n)*, so 2 is a quadratic
// non-residue mod n, hence n mod 8 is in {3, 5} and NTRU_PACK_DEG mod 8 is in {2, 4}.
case 4:
r[11 * i + 0] = (unsigned char) (t[0] & 0xff);
r[11 * i + 1] = (unsigned char) (t[0] >> 8) | ((t[1] & 0x1f) << 3);
r[11 * i + 2] = (unsigned char) (t[1] >> 5) | ((t[2] & 0x03) << 6);
r[11 * i + 3] = (unsigned char) (t[2] >> 2) & 0xff;
r[11 * i + 4] = (unsigned char) (t[2] >> 10) | ((t[3] & 0x7f) << 1);
r[11 * i + 5] = (unsigned char) (t[3] >> 7) | ((t[4] & 0x0f) << 4);
break;
case 2:
r[11 * i + 0] = (unsigned char) (t[0] & 0xff);
r[11 * i + 1] = (unsigned char) (t[0] >> 8) | ((t[1] & 0x1f) << 3);
r[11 * i + 2] = (unsigned char) (t[1] >> 5) | ((t[2] & 0x03) << 6);
break;
}
}

void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_frombytes(poly *r, const unsigned char *a) {
int i;
for (i = 0; i < NTRU_PACK_DEG / 8; i++) {
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8);
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5);
r->coeffs[8 * i + 2] = (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10);
r->coeffs[8 * i + 3] = (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7);
r->coeffs[8 * i + 4] = (a[11 * i + 5] >> 4) | (((uint16_t)a[11 * i + 6] & 0x7f) << 4);
r->coeffs[8 * i + 5] = (a[11 * i + 6] >> 7) | (((uint16_t)a[11 * i + 7] & 0xff) << 1) | (((uint16_t)a[11 * i + 8] & 0x03) << 9);
r->coeffs[8 * i + 6] = (a[11 * i + 8] >> 2) | (((uint16_t)a[11 * i + 9] & 0x1f) << 6);
r->coeffs[8 * i + 7] = (a[11 * i + 9] >> 5) | (((uint16_t)a[11 * i + 10] & 0xff) << 3);
}
switch (NTRU_PACK_DEG & 0x07) {
// cases 0 and 6 are impossible: 2 generates (Z/n)*, so 2 is a quadratic
// non-residue mod n, hence n mod 8 is in {3, 5} and NTRU_PACK_DEG mod 8 is in {2, 4}.
case 4:
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8);
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5);
r->coeffs[8 * i + 2] = (a[11 * i + 2] >> 6) | (((uint16_t)a[11 * i + 3] & 0xff) << 2) | (((uint16_t)a[11 * i + 4] & 0x01) << 10);
r->coeffs[8 * i + 3] = (a[11 * i + 4] >> 1) | (((uint16_t)a[11 * i + 5] & 0x0f) << 7);
break;
case 2:
r->coeffs[8 * i + 0] = (a[11 * i + 0] >> 0) | (((uint16_t)a[11 * i + 1] & 0x07) << 8);
r->coeffs[8 * i + 1] = (a[11 * i + 1] >> 3) | (((uint16_t)a[11 * i + 2] & 0x3f) << 5);
break;
}
r->coeffs[NTRU_N - 1] = 0;
}

void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_tobytes(unsigned char *r, const poly *a) {
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_tobytes(r, a);
}

void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_frombytes(poly *r, const unsigned char *a) {
int i;
PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_frombytes(r, a);

/* Set r[n-1] so that the sum of coefficients is zero mod q */
r->coeffs[NTRU_N - 1] = 0;
for (i = 0; i < NTRU_PACK_DEG; i++) {
r->coeffs[NTRU_N - 1] -= r->coeffs[i];
}
}
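
Size bookkeeping for the packing above: each group of 8 coefficients carries 8 * NTRU_LOGQ = 88 bits, i.e. exactly 11 bytes, and NTRU_PACK_DEG = 676 = 8*84 + 4 leaves a 4-coefficient tail (the case 4 branch), which needs 6 more bytes. The total is 84*11 + 6 = 930 = (11*676 + 7)/8 = NTRU_OWCPA_PUBLICKEYBYTES. Expressed as compile-time checks (illustrative only; _Static_assert is C11 and is not used by the surrounding sources):

_Static_assert(8 * NTRU_LOGQ == 88, "8 coefficients pack into 11 bytes");
_Static_assert((NTRU_PACK_DEG & 0x07) == 4, "the tail takes the case-4 branch");
_Static_assert((NTRU_PACK_DEG / 8) * 11 + 6 == NTRU_OWCPA_PUBLICKEYBYTES, "930-byte encoding");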

+ 37
- 0
crypto_kem/ntruhps2048677/avx2/params.h View File

@@ -0,0 +1,37 @@
#ifndef PARAMS_H
#define PARAMS_H

#define NTRU_HPS
#define NTRU_N 677
#define NTRU_LOGQ 11


/* Do not modify below this line */

#define PAD32(X) ((((X) + 31)/32)*32)

#define NTRU_Q (1 << NTRU_LOGQ)
#define NTRU_WEIGHT (NTRU_Q/8 - 2)

#define NTRU_SEEDBYTES 32
#define NTRU_PRFKEYBYTES 32
#define NTRU_SHAREDKEYBYTES 32

#define NTRU_SAMPLE_IID_BYTES (NTRU_N-1)
#define NTRU_SAMPLE_FT_BYTES ((30*(NTRU_N-1)+7)/8)
#define NTRU_SAMPLE_FG_BYTES (NTRU_SAMPLE_IID_BYTES+NTRU_SAMPLE_FT_BYTES)
#define NTRU_SAMPLE_RM_BYTES (NTRU_SAMPLE_IID_BYTES+NTRU_SAMPLE_FT_BYTES)

#define NTRU_PACK_DEG (NTRU_N-1)
#define NTRU_PACK_TRINARY_BYTES ((NTRU_PACK_DEG+4)/5)

#define NTRU_OWCPA_MSGBYTES (2*NTRU_PACK_TRINARY_BYTES)
#define NTRU_OWCPA_PUBLICKEYBYTES ((NTRU_LOGQ*NTRU_PACK_DEG+7)/8)
#define NTRU_OWCPA_SECRETKEYBYTES (2*NTRU_PACK_TRINARY_BYTES + NTRU_OWCPA_PUBLICKEYBYTES)
#define NTRU_OWCPA_BYTES ((NTRU_LOGQ*NTRU_PACK_DEG+7)/8)

#define NTRU_PUBLICKEYBYTES (NTRU_OWCPA_PUBLICKEYBYTES)
#define NTRU_SECRETKEYBYTES (NTRU_OWCPA_SECRETKEYBYTES + NTRU_PRFKEYBYTES)
#define NTRU_CIPHERTEXTBYTES (NTRU_OWCPA_BYTES)

#endif
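
For orientation, the derived constants for this parameter set (n = 677, q = 2^11) work out as follows; the values follow directly from the macros above and match the published ntruhps2048677 sizes (930-byte public key and ciphertext, 1234-byte secret key):

    NTRU_Q                    = 2048
    NTRU_WEIGHT               = 2048/8 - 2 = 254
    NTRU_PACK_DEG             = 676
    NTRU_PACK_TRINARY_BYTES   = (676 + 4)/5 = 136
    NTRU_OWCPA_MSGBYTES       = 2*136 = 272
    NTRU_OWCPA_PUBLICKEYBYTES = (11*676 + 7)/8 = 930 = NTRU_CIPHERTEXTBYTES
    NTRU_OWCPA_SECRETKEYBYTES = 272 + 930 = 1202
    NTRU_SECRETKEYBYTES       = 1202 + 32 = 1234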

+ 67
- 0
crypto_kem/ntruhps2048677/avx2/poly.c View File

@@ -0,0 +1,67 @@
#include "poly.h"

/* Map {0, 1, 2} -> {0,1,q-1} in place */
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(poly *r) {
int i;
for (i = 0; i < NTRU_N; i++) {
r->coeffs[i] = r->coeffs[i] | ((-(r->coeffs[i] >> 1)) & (NTRU_Q - 1));
}
}

/* Map {0, 1, q-1} -> {0,1,2} in place */
void PQCLEAN_NTRUHPS2048677_AVX2_poly_trinary_Zq_to_Z3(poly *r) {
int i;
for (i = 0; i < NTRU_N; i++) {
r->coeffs[i] = MODQ(r->coeffs[i]);
r->coeffs[i] = 3 & (r->coeffs[i] ^ (r->coeffs[i] >> (NTRU_LOGQ - 1)));
}
}

void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_mul(poly *r, const poly *a, const poly *b) {
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(r, a, b);
PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n(r);
}

void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_mul(poly *r, const poly *a, const poly *b) {
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(r, a, b);
PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n(r);
}

static void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_inv_to_Rq_inv(poly *r, const poly *ai, const poly *a) {

int i;
poly b, c;
poly s;

// 4 Newton iterations: ai = ai * (2 - a*ai) mod q,
// each doubling the 2-adic precision of the inverse (1 -> 2 -> 4 -> 8 -> 16 bits >= NTRU_LOGQ)
for (i = 0; i < NTRU_N; i++) {
b.coeffs[i] = -(a->coeffs[i]);
}

for (i = 0; i < NTRU_N; i++) {
r->coeffs[i] = ai->coeffs[i];
}

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&c, r, &b);
c.coeffs[0] += 2; // c = 2 - a*ai
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&s, &c, r); // s = ai*c

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&c, &s, &b);
c.coeffs[0] += 2; // c = 2 - a*s
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(r, &c, &s); // r = s*c

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&c, r, &b);
c.coeffs[0] += 2; // c = 2 - a*r
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&s, &c, r); // s = r*c

PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(&c, &s, &b);
c.coeffs[0] += 2; // c = 2 - a*s
PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(r, &c, &s); // r = s*c
}

void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_inv(poly *r, const poly *a) {
poly ai2;
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_inv(&ai2, a);
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_inv_to_Rq_inv(r, &ai2, a);
}
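
Two remarks on poly.c above, with a small illustrative check (not part of the commit). First, poly_Z3_to_Zq and poly_trinary_Zq_to_Z3 are mutually inverse on {0, 1, 2} <-> {0, 1, q-1}: the forward map sends 2 to q-1 = 2047 and fixes 0 and 1, and the xor-shift in the backward map undoes it. Second, poly_R2_inv_to_Rq_inv is a Newton (Hensel) lift: if a*ai = 1 (mod 2^k), then ai*(2 - a*ai) is an inverse mod 2^(2k), so the four unrolled iterations take the mod-2 inverse from poly_R2_inv to an inverse mod 2^16, which covers q = 2^11.

#include <assert.h>

/* Exhaustive check of the coefficient maps on their three inputs (illustrative only). */
static void check_trinary_maps(void) {
    uint16_t v, w;
    for (v = 0; v < 3; v++) {
        w = v | ((uint16_t)(-(v >> 1)) & (NTRU_Q - 1));   /* poly_Z3_to_Zq step */
        assert(w == (v == 2 ? NTRU_Q - 1 : v));
        assert((3 & (w ^ (w >> (NTRU_LOGQ - 1)))) == v);  /* poly_trinary_Zq_to_Z3 step */
    }
}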

+ 41
- 0
crypto_kem/ntruhps2048677/avx2/poly.h View File

@@ -0,0 +1,41 @@
#ifndef POLY_H
#define POLY_H

#include <immintrin.h>
#include <stdint.h>

#include "params.h"

#define MODQ(X) ((X) & (NTRU_Q-1))

typedef union { /* align to 32 byte boundary for vmovdqa */
uint16_t coeffs[PAD32(NTRU_N)];
__m256i coeffs_x16[PAD32(NTRU_N) / 16];
} poly;

void PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n(poly *r);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n(poly *r);

void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_tobytes(unsigned char *r, const poly *a);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_frombytes(poly *r, const unsigned char *a);

void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_tobytes(unsigned char *r, const poly *a);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_sum_zero_frombytes(poly *r, const unsigned char *a);

void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_tobytes(unsigned char msg[NTRU_PACK_TRINARY_BYTES], const poly *a);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_frombytes(poly *r, const unsigned char msg[NTRU_PACK_TRINARY_BYTES]);

void PQCLEAN_NTRUHPS2048677_AVX2_poly_Sq_mul(poly *r, const poly *a, const poly *b);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_mul(poly *r, const poly *a, const poly *b);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_mul(poly *r, const poly *a, const poly *b);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_lift(poly *r, const poly *a);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_to_S3(poly *r, const poly *a);

void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_inv(poly *r, const poly *a);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_Rq_inv(poly *r, const poly *a);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_inv(poly *r, const poly *a);

void PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(poly *r);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_trinary_Zq_to_Z3(poly *r);

#endif

+ 11
- 0
crypto_kem/ntruhps2048677/avx2/poly_lift.c View File

@@ -0,0 +1,11 @@
#include "poly.h"

void PQCLEAN_NTRUHPS2048677_AVX2_poly_lift(poly *r, const poly *a) {
int i;
for (i = 0; i < NTRU_N; i++) {
r->coeffs[i] = a->coeffs[i];
}
PQCLEAN_NTRUHPS2048677_AVX2_poly_Z3_to_Zq(r);
}



+ 928
- 0
crypto_kem/ntruhps2048677/avx2/poly_mod_3_Phi_n.s View File

@@ -0,0 +1,928 @@
.data
.p2align 5
mask_ff:
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
.word 0xff
mask_f:
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
.word 0xf
mask_3:
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.word 0x03
.text
.global PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n
.global _PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n
PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n:
_PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_3_Phi_n:
vmovdqa 1344(%rdi), %ymm0
vpermq $1, %ymm0, %ymm0
vpslld $17, %ymm0, %ymm0
vpsrld $16, %ymm0, %ymm1
vpor %ymm0, %ymm1, %ymm0
vbroadcastss %xmm0, %ymm0
vpaddw 0(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 0(%rdi)
vpaddw 32(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 32(%rdi)
vpaddw 64(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 64(%rdi)
vpaddw 96(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 96(%rdi)
vpaddw 128(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 128(%rdi)
vpaddw 160(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 160(%rdi)
vpaddw 192(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 192(%rdi)
vpaddw 224(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 224(%rdi)
vpaddw 256(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 256(%rdi)
vpaddw 288(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 288(%rdi)
vpaddw 320(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 320(%rdi)
vpaddw 352(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 352(%rdi)
vpaddw 384(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 384(%rdi)
vpaddw 416(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 416(%rdi)
vpaddw 448(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 448(%rdi)
vpaddw 480(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 480(%rdi)
vpaddw 512(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 512(%rdi)
vpaddw 544(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 544(%rdi)
vpaddw 576(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 576(%rdi)
vpaddw 608(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 608(%rdi)
vpaddw 640(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 640(%rdi)
vpaddw 672(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 672(%rdi)
vpaddw 704(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 704(%rdi)
vpaddw 736(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 736(%rdi)
vpaddw 768(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 768(%rdi)
vpaddw 800(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 800(%rdi)
vpaddw 832(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 832(%rdi)
vpaddw 864(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 864(%rdi)
vpaddw 896(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 896(%rdi)
vpaddw 928(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 928(%rdi)
vpaddw 960(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 960(%rdi)
vpaddw 992(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 992(%rdi)
vpaddw 1024(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1024(%rdi)
vpaddw 1056(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1056(%rdi)
vpaddw 1088(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1088(%rdi)
vpaddw 1120(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1120(%rdi)
vpaddw 1152(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1152(%rdi)
vpaddw 1184(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1184(%rdi)
vpaddw 1216(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1216(%rdi)
vpaddw 1248(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1248(%rdi)
vpaddw 1280(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1280(%rdi)
vpaddw 1312(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1312(%rdi)
vpaddw 1344(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1344(%rdi)
vpaddw 1376(%rdi), %ymm0, %ymm1
vpsrlw $8, %ymm1, %ymm2
vpand mask_ff(%rip), %ymm1, %ymm1
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_f(%rip), %ymm2, %ymm1
vpsrlw $4, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpand mask_3(%rip), %ymm2, %ymm1
vpsrlw $2, %ymm2, %ymm2
vpaddw %ymm2, %ymm1, %ymm2
vpsubw mask_3(%rip), %ymm2, %ymm14
vpsraw $15, %ymm14, %ymm15
vpandn %ymm14, %ymm15, %ymm1
vpand %ymm15, %ymm2, %ymm14
vpxor %ymm14, %ymm1, %ymm2
vmovdqa %ymm2, 1376(%rdi)
movw $0, 1354(%rdi)
movw $0, 1356(%rdi)
movw $0, 1358(%rdi)
movw $0, 1360(%rdi)
movw $0, 1362(%rdi)
movw $0, 1364(%rdi)
movw $0, 1366(%rdi)
movw $0, 1368(%rdi)
movw $0, 1370(%rdi)
movw $0, 1372(%rdi)
movw $0, 1374(%rdi)
movw $0, 1376(%rdi)
movw $0, 1378(%rdi)
movw $0, 1380(%rdi)
movw $0, 1382(%rdi)
movw $0, 1384(%rdi)
movw $0, 1386(%rdi)
movw $0, 1388(%rdi)
movw $0, 1390(%rdi)
movw $0, 1392(%rdi)
movw $0, 1394(%rdi)
movw $0, 1396(%rdi)
movw $0, 1398(%rdi)
movw $0, 1400(%rdi)
movw $0, 1402(%rdi)
movw $0, 1404(%rdi)
movw $0, 1406(%rdi)
ret
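
What the unrolled code above computes, in scalar terms: the prologue broadcasts twice the top coefficient (vpermq/vpslld $17 move coefficient 676 into every lane and double it; adding 2*c is the same as subtracting c mod 3), each 16-bit lane then gets that added and is reduced mod 3 by repeatedly folding digits (256, 16 and 4 are all congruent to 1 mod 3), the vpsubw/vpsraw tail brings the result into {0, 1, 2}, and the movw stores clear the padding coefficients 677..703. A rough C sketch of the same operation (illustrative, not the committed reference code; coefficients are assumed reduced mod q as at the call sites):

static uint16_t mod3_fold(uint16_t a) {
    uint16_t r;
    r = (uint16_t)((a >> 8) + (a & 0xff));   /* 256 == 1 (mod 3) */
    r = (uint16_t)((r >> 4) + (r & 0x0f));   /*  16 == 1 (mod 3) */
    r = (uint16_t)((r >> 2) + (r & 0x03));   /*   4 == 1 (mod 3) */
    r = (uint16_t)((r >> 2) + (r & 0x03));
    return (uint16_t)(r >= 3 ? r - 3 : r);   /* conditional subtract, as in the asm tail */
}

static void poly_mod_3_Phi_n_ref(poly *r) {
    int i;
    /* x^(n-1) == -(1 + x + ... + x^(n-2)) mod Phi_n, and -1 == 2 (mod 3) */
    for (i = 0; i < NTRU_N; i++) {
        r->coeffs[i] = mod3_fold((uint16_t)(r->coeffs[i] + 2 * r->coeffs[NTRU_N - 1]));
    }
    for (i = NTRU_N; i < PAD32(NTRU_N); i++) {
        r->coeffs[i] = 0;                    /* clear the alignment padding, as the asm does */
    }
}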

+ 104
- 0
crypto_kem/ntruhps2048677/avx2/poly_mod_q_Phi_n.s View File

@@ -0,0 +1,104 @@
.data
.p2align 5
.text
.global PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n
.global _PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n
PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n:
_PQCLEAN_NTRUHPS2048677_AVX2_poly_mod_q_Phi_n:
vmovdqa 1344(%rdi), %ymm0
vpermq $1, %ymm0, %ymm0
vpslld $16, %ymm0, %ymm0
vpsrld $16, %ymm0, %ymm1
vpor %ymm0, %ymm1, %ymm0
vbroadcastss %xmm0, %ymm0
vxorpd %ymm1, %ymm1, %ymm1
vpsubw %ymm0, %ymm1, %ymm0
vpaddw 0(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 0(%rdi)
vpaddw 32(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 32(%rdi)
vpaddw 64(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 64(%rdi)
vpaddw 96(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 96(%rdi)
vpaddw 128(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 128(%rdi)
vpaddw 160(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 160(%rdi)
vpaddw 192(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 192(%rdi)
vpaddw 224(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 224(%rdi)
vpaddw 256(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 256(%rdi)
vpaddw 288(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 288(%rdi)
vpaddw 320(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 320(%rdi)
vpaddw 352(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 352(%rdi)
vpaddw 384(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 384(%rdi)
vpaddw 416(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 416(%rdi)
vpaddw 448(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 448(%rdi)
vpaddw 480(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 480(%rdi)
vpaddw 512(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 512(%rdi)
vpaddw 544(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 544(%rdi)
vpaddw 576(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 576(%rdi)
vpaddw 608(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 608(%rdi)
vpaddw 640(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 640(%rdi)
vpaddw 672(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 672(%rdi)
vpaddw 704(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 704(%rdi)
vpaddw 736(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 736(%rdi)
vpaddw 768(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 768(%rdi)
vpaddw 800(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 800(%rdi)
vpaddw 832(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 832(%rdi)
vpaddw 864(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 864(%rdi)
vpaddw 896(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 896(%rdi)
vpaddw 928(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 928(%rdi)
vpaddw 960(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 960(%rdi)
vpaddw 992(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 992(%rdi)
vpaddw 1024(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1024(%rdi)
vpaddw 1056(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1056(%rdi)
vpaddw 1088(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1088(%rdi)
vpaddw 1120(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1120(%rdi)
vpaddw 1152(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1152(%rdi)
vpaddw 1184(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1184(%rdi)
vpaddw 1216(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1216(%rdi)
vpaddw 1248(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1248(%rdi)
vpaddw 1280(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1280(%rdi)
vpaddw 1312(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1312(%rdi)
vpaddw 1344(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1344(%rdi)
vpaddw 1376(%rdi), %ymm0, %ymm1
vmovdqa %ymm1, 1376(%rdi)
ret
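
This routine broadcasts the top coefficient (vpermq/vpslld $16/vpsrld $16 isolate coefficient 676), negates it, and adds it to every lane; in other words it subtracts coeffs[n-1] from every coefficient, which is reduction modulo Phi_n = 1 + x + ... + x^(n-1) with arithmetic mod 2^16 (only the low NTRU_LOGQ bits are meaningful, and they are masked at packing time). A scalar sketch of the same operation (illustrative only):

static void poly_mod_q_Phi_n_ref(poly *r) {
    int i;
    for (i = 0; i < NTRU_N; i++) {
        r->coeffs[i] = (uint16_t)(r->coeffs[i] - r->coeffs[NTRU_N - 1]);
    }
}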

+ 73
- 0
crypto_kem/ntruhps2048677/avx2/poly_r2_inv.c View File

@@ -0,0 +1,73 @@
#include "poly_r2_inv.h"
#include "poly.h"

// TODO: this costs 1764 cycles (implementing it like S3_tobytes costs 2108).
// It could be done more cleanly in assembly using the pdep / pext instructions.
void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_tobytes(unsigned char *out, const poly *a) {
int i, j, k;
for (i = 0; i < 12; i++) {
for (k = 0; k < 8; k++) {
out[i * 8 + k] = 0;
for (j = 0; j < 8; j++) {
if ((i * 8 + k) * 8 + j < NTRU_N) {
out[i * 8 + k] |= (a->coeffs[(i * 8 + k) * 8 + j] & 1) << j;
}
}
}
}
}

void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_frombytes(poly *a, const unsigned char *in) {
int i, j, k;
for (i = 0; i < 12; i++) {
for (k = 0; k < 8; k++) {
for (j = 0; j < 8; j++) {
if ((i * 8 + k) * 8 + j < NTRU_N) {
a->coeffs[(i * 8 + k) * 8 + j] = (in[i * 8 + k] >> j) & 1;
}
}
}
}
}

void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_inv(poly *r, const poly *a) {
union {
unsigned char s[96];
__m256i s_x32[3];
} squares[13];
#define s(x) squares[(x)].s

// This relies on the following addition chain:
// 1, 2, 3, 5, 10, 20, 21, 42, 84, 168, 336, 672, 675

PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_tobytes(s(0), a); // TODO alignment

PQCLEAN_NTRUHPS2048677_AVX2_square_1_677(s(1), s(0));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(1), s(1), s(0));
PQCLEAN_NTRUHPS2048677_AVX2_square_1_677(s(2), s(1));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(2), s(2), s(0));
PQCLEAN_NTRUHPS2048677_AVX2_square_2_677(s(3), s(2));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(3), s(3), s(1));
PQCLEAN_NTRUHPS2048677_AVX2_square_5_677(s(4), s(3));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(4), s(4), s(3));
PQCLEAN_NTRUHPS2048677_AVX2_square_10_677(s(5), s(4));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(5), s(5), s(4));
PQCLEAN_NTRUHPS2048677_AVX2_square_1_677(s(6), s(5));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(6), s(6), s(0));
PQCLEAN_NTRUHPS2048677_AVX2_square_21_677(s(7), s(6));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(7), s(7), s(6));
PQCLEAN_NTRUHPS2048677_AVX2_square_42_677(s(8), s(7));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(8), s(8), s(7));
PQCLEAN_NTRUHPS2048677_AVX2_square_84_677(s(9), s(8));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(9), s(9), s(8));
PQCLEAN_NTRUHPS2048677_AVX2_square_168_677(s(10), s(9));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(10), s(10), s(9));
PQCLEAN_NTRUHPS2048677_AVX2_square_336_677(s(11), s(10));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(11), s(11), s(10));
PQCLEAN_NTRUHPS2048677_AVX2_square_3_677(s(12), s(11));
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(s(12), s(12), s(2));
PQCLEAN_NTRUHPS2048677_AVX2_square_1_677(s(0), s(12));

PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_frombytes(r, s(0));
#undef s
}
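
The addition chain in the comment lists the exponents k of the intermediate values a^(2^k - 1): each square_k_677 call raises to the 2^k-th power (multiplying the exponent by 2^k) and each poly_R2_mul adds exponents, so the code walks through a^(2^2 - 1), a^(2^3 - 1), a^(2^5 - 1), ..., a^(2^672 - 1), a^(2^675 - 1), and one final squaring yields a^(2^676 - 2). That equals a^(-1) for every invertible a in GF(2)[x]/Phi_677, because every irreducible factor of Phi_677 over GF(2) has degree dividing 676, so a^(2^676 - 1) = 1 factor by factor. Worked out for the last two steps:

    s(11) = a^(2^672 - 1)
    s(12) = square_3(s(11)) * s(2) = a^(8*(2^672 - 1)) * a^(2^3 - 1) = a^(2^675 - 1)
    s(0)  = square_1(s(12)) = a^(2^676 - 2) = a^(-1)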

+ 23
- 0
crypto_kem/ntruhps2048677/avx2/poly_r2_inv.h View File

@@ -0,0 +1,23 @@
#ifndef POLY_R2_INV_H
#define POLY_R2_INV_H

#include "poly.h"

void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_tobytes(unsigned char *out, const poly *a);
void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_frombytes(poly *a, const unsigned char *in);

extern void PQCLEAN_NTRUHPS2048677_AVX2_square_1_677(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_2_677(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_3_677(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_5_677(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_10_677(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_21_677(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_42_677(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_84_677(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_168_677(unsigned char *out, const unsigned char *a);
extern void PQCLEAN_NTRUHPS2048677_AVX2_square_336_677(unsigned char *out, const unsigned char *a);

extern void PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul(unsigned char *out, const unsigned char *a,
const unsigned char *b);

#endif

+ 466
- 0
crypto_kem/ntruhps2048677/avx2/poly_r2_mul.s View File

@@ -0,0 +1,466 @@
.data
.p2align 5
mask1100:
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
mask0110:
.word 0
.word 0
.word 0
.word 0
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 0
.word 0
.word 0
.word 0
mask0011:
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
mask1000:
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 65535
.word 65535
.word 65535
.word 65535
mask0111:
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 0
.word 0
.word 0
.word 0
low165:
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 65535
.word 31
.word 0
.word 0
.word 0
.word 0
.word 0
.text
.global PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul
.global _PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul
PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul:
_PQCLEAN_NTRUHPS2048677_AVX2_poly_R2_mul:
vmovdqa 0(%rsi), %ymm0
vmovdqa 32(%rsi), %ymm1
vmovdqa 0(%rdx), %ymm3
vmovdqa 32(%rdx), %ymm4
vpxor %ymm0, %ymm1, %ymm6
vpxor %ymm3, %ymm4, %ymm7
vextracti128 $1, %ymm0, %xmm11
vextracti128 $1, %ymm3, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm5
vpclmulqdq $16, %xmm11, %xmm12, %xmm14
vpclmulqdq $17, %xmm11, %xmm12, %xmm15
vpxor %xmm5, %xmm14, %xmm14
vpclmulqdq $0, %xmm11, %xmm12, %xmm5
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm5, %ymm5
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm5, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm5
vpxor %xmm0, %xmm11, %xmm11
vpxor %xmm3, %xmm12, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm13
vpclmulqdq $16, %xmm11, %xmm12, %xmm14
vpclmulqdq $17, %xmm11, %xmm12, %xmm15
vpxor %xmm13, %xmm14, %xmm14
vpclmulqdq $0, %xmm11, %xmm12, %xmm13
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm13, %ymm13
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm13, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm13
vpclmulqdq $1, %xmm0, %xmm3, %xmm2
vpclmulqdq $16, %xmm0, %xmm3, %xmm14
vpclmulqdq $17, %xmm0, %xmm3, %xmm15
vpxor %xmm2, %xmm14, %xmm14
vpclmulqdq $0, %xmm0, %xmm3, %xmm2
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm2, %ymm2
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm2, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm2
vpxor %ymm13, %ymm5, %ymm13
vpxor %ymm13, %ymm2, %ymm13
vpxor %ymm11, %ymm11, %ymm11
vextracti128 $1, %ymm13, %xmm11
vpxor %ymm5, %ymm11, %ymm5
vpxor %ymm11, %ymm11, %ymm11
vinserti128 $1, %xmm13, %ymm11, %ymm11
vpxor %ymm11, %ymm2, %ymm2
vextracti128 $1, %ymm6, %xmm11
vextracti128 $1, %ymm7, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm9
vpclmulqdq $16, %xmm11, %xmm12, %xmm14
vpclmulqdq $17, %xmm11, %xmm12, %xmm15
vpxor %xmm9, %xmm14, %xmm14
vpclmulqdq $0, %xmm11, %xmm12, %xmm9
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm9, %ymm9
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm9, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm9
vpxor %xmm6, %xmm11, %xmm11
vpxor %xmm7, %xmm12, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm13
vpclmulqdq $16, %xmm11, %xmm12, %xmm14
vpclmulqdq $17, %xmm11, %xmm12, %xmm15
vpxor %xmm13, %xmm14, %xmm14
vpclmulqdq $0, %xmm11, %xmm12, %xmm13
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm13, %ymm13
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm13, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm13
vpclmulqdq $1, %xmm6, %xmm7, %xmm8
vpclmulqdq $16, %xmm6, %xmm7, %xmm14
vpclmulqdq $17, %xmm6, %xmm7, %xmm15
vpxor %xmm8, %xmm14, %xmm14
vpclmulqdq $0, %xmm6, %xmm7, %xmm8
vpermq $16, %ymm14, %ymm14
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm8, %ymm8
vpand mask0110(%rip), %ymm14, %ymm14
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm8, %ymm14, %ymm14
vpxor %ymm14, %ymm15, %ymm8
vpxor %ymm13, %ymm9, %ymm13
vpxor %ymm13, %ymm8, %ymm13
vpxor %ymm11, %ymm11, %ymm11
vextracti128 $1, %ymm13, %xmm11
vpxor %ymm9, %ymm11, %ymm9
vpxor %ymm11, %ymm11, %ymm11
vinserti128 $1, %xmm13, %ymm11, %ymm11
vpxor %ymm11, %ymm8, %ymm8
vpxor %ymm8, %ymm2, %ymm0
vpxor %ymm9, %ymm5, %ymm3
vpxor %ymm5, %ymm0, %ymm0
vpxor %ymm3, %ymm8, %ymm8
vmovdqa 64(%rsi), %ymm10
vmovdqa 64(%rdx), %ymm15
vpxor %ymm6, %ymm10, %ymm6
vpxor %ymm7, %ymm15, %ymm7
vextracti128 $1, %ymm6, %xmm11
vextracti128 $1, %ymm7, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm5
vpclmulqdq $16, %xmm11, %xmm12, %xmm13
vpclmulqdq $17, %xmm11, %xmm12, %xmm14
vpxor %xmm5, %xmm13, %xmm13
vpclmulqdq $0, %xmm11, %xmm12, %xmm5
vpermq $16, %ymm13, %ymm13
vinserti128 $1, %xmm14, %ymm14, %ymm14
vpand mask0011(%rip), %ymm5, %ymm5
vpand mask0110(%rip), %ymm13, %ymm13
vpand mask1100(%rip), %ymm14, %ymm14
vpxor %ymm5, %ymm13, %ymm13
vpxor %ymm13, %ymm14, %ymm5
vpclmulqdq $1, %xmm6, %xmm7, %xmm3
vpclmulqdq $16, %xmm6, %xmm7, %xmm13
vpclmulqdq $17, %xmm6, %xmm7, %xmm14
vpxor %xmm3, %xmm13, %xmm13
vpclmulqdq $0, %xmm6, %xmm7, %xmm3
vpermq $16, %ymm13, %ymm13
vinserti128 $1, %xmm14, %ymm14, %ymm14
vpand mask0011(%rip), %ymm3, %ymm3
vpand mask0110(%rip), %ymm13, %ymm13
vpand mask1100(%rip), %ymm14, %ymm14
vpxor %ymm3, %ymm13, %ymm13
vpxor %ymm13, %ymm14, %ymm3
vpxor %xmm6, %xmm11, %xmm11
vpxor %xmm7, %xmm12, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm13
vpclmulqdq $16, %xmm11, %xmm12, %xmm6
vpclmulqdq $17, %xmm11, %xmm12, %xmm7
vpxor %xmm13, %xmm6, %xmm6
vpclmulqdq $0, %xmm11, %xmm12, %xmm13
vpermq $16, %ymm6, %ymm6
vinserti128 $1, %xmm7, %ymm7, %ymm7
vpand mask0011(%rip), %ymm13, %ymm13
vpand mask0110(%rip), %ymm6, %ymm6
vpand mask1100(%rip), %ymm7, %ymm7
vpxor %ymm13, %ymm6, %ymm6
vpxor %ymm6, %ymm7, %ymm13
vpxor %ymm13, %ymm5, %ymm13
vpxor %ymm13, %ymm3, %ymm13
vpxor %ymm11, %ymm11, %ymm11
vextracti128 $1, %ymm13, %xmm11
vpxor %ymm5, %ymm11, %ymm5
vpxor %ymm11, %ymm11, %ymm11
vinserti128 $1, %xmm13, %ymm11, %ymm11
vpxor %ymm11, %ymm3, %ymm3
vpxor %ymm3, %ymm8, %ymm8
vpxor %ymm5, %ymm9, %ymm9
vpxor %ymm1, %ymm10, %ymm6
vpxor %ymm4, %ymm15, %ymm7
vextracti128 $1, %ymm6, %xmm11
vextracti128 $1, %ymm7, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm5
vpclmulqdq $16, %xmm11, %xmm12, %xmm13
vpclmulqdq $17, %xmm11, %xmm12, %xmm14
vpxor %xmm5, %xmm13, %xmm13
vpclmulqdq $0, %xmm11, %xmm12, %xmm5
vpermq $16, %ymm13, %ymm13
vinserti128 $1, %xmm14, %ymm14, %ymm14
vpand mask0011(%rip), %ymm5, %ymm5
vpand mask0110(%rip), %ymm13, %ymm13
vpand mask1100(%rip), %ymm14, %ymm14
vpxor %ymm5, %ymm13, %ymm13
vpxor %ymm13, %ymm14, %ymm5
vpclmulqdq $1, %xmm6, %xmm7, %xmm3
vpclmulqdq $16, %xmm6, %xmm7, %xmm13
vpclmulqdq $17, %xmm6, %xmm7, %xmm14
vpxor %xmm3, %xmm13, %xmm13
vpclmulqdq $0, %xmm6, %xmm7, %xmm3
vpermq $16, %ymm13, %ymm13
vinserti128 $1, %xmm14, %ymm14, %ymm14
vpand mask0011(%rip), %ymm3, %ymm3
vpand mask0110(%rip), %ymm13, %ymm13
vpand mask1100(%rip), %ymm14, %ymm14
vpxor %ymm3, %ymm13, %ymm13
vpxor %ymm13, %ymm14, %ymm3
vpxor %xmm6, %xmm11, %xmm11
vpxor %xmm7, %xmm12, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm13
vpclmulqdq $16, %xmm11, %xmm12, %xmm6
vpclmulqdq $17, %xmm11, %xmm12, %xmm7
vpxor %xmm13, %xmm6, %xmm6
vpclmulqdq $0, %xmm11, %xmm12, %xmm13
vpermq $16, %ymm6, %ymm6
vinserti128 $1, %xmm7, %ymm7, %ymm7
vpand mask0011(%rip), %ymm13, %ymm13
vpand mask0110(%rip), %ymm6, %ymm6
vpand mask1100(%rip), %ymm7, %ymm7
vpxor %ymm13, %ymm6, %ymm6
vpxor %ymm6, %ymm7, %ymm13
vpxor %ymm13, %ymm5, %ymm13
vpxor %ymm13, %ymm3, %ymm13
vpxor %ymm11, %ymm11, %ymm11
vextracti128 $1, %ymm13, %xmm11
vpxor %ymm5, %ymm11, %ymm5
vpxor %ymm11, %ymm11, %ymm11
vinserti128 $1, %xmm13, %ymm11, %ymm11
vpxor %ymm11, %ymm3, %ymm3
vpxor %ymm3, %ymm8, %ymm8
vpxor %ymm5, %ymm9, %ymm9
vextracti128 $1, %ymm1, %xmm11
vextracti128 $1, %ymm4, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm7
vpclmulqdq $16, %xmm11, %xmm12, %xmm13
vpclmulqdq $17, %xmm11, %xmm12, %xmm14
vpxor %xmm7, %xmm13, %xmm13
vpclmulqdq $0, %xmm11, %xmm12, %xmm7
vpermq $16, %ymm13, %ymm13
vinserti128 $1, %xmm14, %ymm14, %ymm14
vpand mask0011(%rip), %ymm7, %ymm7
vpand mask0110(%rip), %ymm13, %ymm13
vpand mask1100(%rip), %ymm14, %ymm14
vpxor %ymm7, %ymm13, %ymm13
vpxor %ymm13, %ymm14, %ymm7
vpclmulqdq $1, %xmm1, %xmm4, %xmm6
vpclmulqdq $16, %xmm1, %xmm4, %xmm13
vpclmulqdq $17, %xmm1, %xmm4, %xmm14
vpxor %xmm6, %xmm13, %xmm13
vpclmulqdq $0, %xmm1, %xmm4, %xmm6
vpermq $16, %ymm13, %ymm13
vinserti128 $1, %xmm14, %ymm14, %ymm14
vpand mask0011(%rip), %ymm6, %ymm6
vpand mask0110(%rip), %ymm13, %ymm13
vpand mask1100(%rip), %ymm14, %ymm14
vpxor %ymm6, %ymm13, %ymm13
vpxor %ymm13, %ymm14, %ymm6
vpxor %xmm1, %xmm11, %xmm11
vpxor %xmm4, %xmm12, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm13
vpclmulqdq $16, %xmm11, %xmm12, %xmm1
vpclmulqdq $17, %xmm11, %xmm12, %xmm4
vpxor %xmm13, %xmm1, %xmm1
vpclmulqdq $0, %xmm11, %xmm12, %xmm13
vpermq $16, %ymm1, %ymm1
vinserti128 $1, %xmm4, %ymm4, %ymm4
vpand mask0011(%rip), %ymm13, %ymm13
vpand mask0110(%rip), %ymm1, %ymm1
vpand mask1100(%rip), %ymm4, %ymm4
vpxor %ymm13, %ymm1, %ymm1
vpxor %ymm1, %ymm4, %ymm13
vpxor %ymm13, %ymm7, %ymm13
vpxor %ymm13, %ymm6, %ymm13
vpxor %ymm11, %ymm11, %ymm11
vextracti128 $1, %ymm13, %xmm11
vpxor %ymm7, %ymm11, %ymm7
vpxor %ymm11, %ymm11, %ymm11
vinserti128 $1, %xmm13, %ymm11, %ymm11
vpxor %ymm11, %ymm6, %ymm6
vpxor %ymm6, %ymm0, %ymm0
vpxor %ymm7, %ymm8, %ymm8
vpxor %ymm6, %ymm3, %ymm3
vpxor %ymm7, %ymm5, %ymm5
vextracti128 $1, %ymm10, %xmm11
vextracti128 $1, %ymm15, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm4
vpclmulqdq $16, %xmm11, %xmm12, %xmm13
vpclmulqdq $17, %xmm11, %xmm12, %xmm14
vpxor %xmm4, %xmm13, %xmm13
vpclmulqdq $0, %xmm11, %xmm12, %xmm4
vpermq $16, %ymm13, %ymm13
vinserti128 $1, %xmm14, %ymm14, %ymm14
vpand mask0011(%rip), %ymm4, %ymm4
vpand mask0110(%rip), %ymm13, %ymm13
vpand mask1100(%rip), %ymm14, %ymm14
vpxor %ymm4, %ymm13, %ymm13
vpxor %ymm13, %ymm14, %ymm4
vpclmulqdq $1, %xmm10, %xmm15, %xmm1
vpclmulqdq $16, %xmm10, %xmm15, %xmm13
vpclmulqdq $17, %xmm10, %xmm15, %xmm14
vpxor %xmm1, %xmm13, %xmm13
vpclmulqdq $0, %xmm10, %xmm15, %xmm1
vpermq $16, %ymm13, %ymm13
vinserti128 $1, %xmm14, %ymm14, %ymm14
vpand mask0011(%rip), %ymm1, %ymm1
vpand mask0110(%rip), %ymm13, %ymm13
vpand mask1100(%rip), %ymm14, %ymm14
vpxor %ymm1, %ymm13, %ymm13
vpxor %ymm13, %ymm14, %ymm1
vpxor %xmm10, %xmm11, %xmm11
vpxor %xmm15, %xmm12, %xmm12
vpclmulqdq $1, %xmm11, %xmm12, %xmm13
vpclmulqdq $16, %xmm11, %xmm12, %xmm10
vpclmulqdq $17, %xmm11, %xmm12, %xmm15
vpxor %xmm13, %xmm10, %xmm10
vpclmulqdq $0, %xmm11, %xmm12, %xmm13
vpermq $16, %ymm10, %ymm10
vinserti128 $1, %xmm15, %ymm15, %ymm15
vpand mask0011(%rip), %ymm13, %ymm13
vpand mask0110(%rip), %ymm10, %ymm10
vpand mask1100(%rip), %ymm15, %ymm15
vpxor %ymm13, %ymm10, %ymm10
vpxor %ymm10, %ymm15, %ymm13
vpxor %ymm13, %ymm4, %ymm13
vpxor %ymm13, %ymm1, %ymm13
vpxor %ymm11, %ymm11, %ymm11
vextracti128 $1, %ymm13, %xmm11
vpxor %ymm4, %ymm11, %ymm4
vpxor %ymm11, %ymm11, %ymm11
vinserti128 $1, %xmm13, %ymm11, %ymm11
vpxor %ymm11, %ymm1, %ymm1
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm4, %ymm5, %ymm5
vpxor %ymm9, %ymm3, %ymm3
vpxor %ymm5, %ymm1, %ymm1
vpand mask1100(%rip), %ymm8, %ymm13
vpand mask0011(%rip), %ymm3, %ymm12
vpxor %ymm12, %ymm13, %ymm12
vpsrlq $37, %ymm12, %ymm12
vpermq $78, %ymm12, %ymm12
vpxor %ymm12, %ymm2, %ymm2
vpand mask1000(%rip), %ymm8, %ymm12
vpand mask0111(%rip), %ymm3, %ymm13
vpxor %ymm12, %ymm13, %ymm12
vpsllq $27, %ymm12, %ymm12
vpermq $147, %ymm12, %ymm12
vpxor %ymm12, %ymm2, %ymm2
vpand mask1100(%rip), %ymm3, %ymm13
vpand mask0011(%rip), %ymm1, %ymm12
vpxor %ymm12, %ymm13, %ymm12
vpsrlq $37, %ymm12, %ymm12
vpermq $78, %ymm12, %ymm12
vpxor %ymm12, %ymm0, %ymm0
vpand mask1000(%rip), %ymm3, %ymm12
vpand mask0111(%rip), %ymm1, %ymm13
vpxor %ymm12, %ymm13, %ymm12
vpsllq $27, %ymm12, %ymm12
vpermq $147, %ymm12, %ymm12
vpxor %ymm12, %ymm0, %ymm0
vpand mask1100(%rip), %ymm1, %ymm13
vpand mask0011(%rip), %ymm4, %ymm12
vpxor %ymm12, %ymm13, %ymm12
vpsrlq $37, %ymm12, %ymm12
vpermq $78, %ymm12, %ymm12
vpxor %ymm12, %ymm8, %ymm8
vpand mask1000(%rip), %ymm1, %ymm12
vpand mask0111(%rip), %ymm4, %ymm13
vpxor %ymm12, %ymm13, %ymm12
vpsllq $27, %ymm12, %ymm12
vpermq $147, %ymm12, %ymm12
vpxor %ymm12, %ymm8, %ymm8
vpand low165(%rip), %ymm8, %ymm8
vmovdqa %ymm2, 0(%rdi)
vmovdqa %ymm0, 32(%rdi)
vmovdqa %ymm8, 64(%rdi)
ret
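
The recurring block of four vpclmulqdq instructions (immediates $0, $1, $16, $17) followed by the mask0011 / mask0110 / mask1100 combination above is a 128x128 -> 256-bit carry-less (GF(2)[x]) multiplication built from four 64x64 products; the routine assembles the 768x768-bit product from several such blocks in a Karatsuba-style arrangement of the three 256-bit limbs and finally folds it back using x^677 = 1 (the low165 mask keeps 512 + 165 = 677 result bits). As a self-contained illustration of the basic building block in intrinsics (a sketch, not code from this commit):

#include <wmmintrin.h>   /* PCLMULQDQ intrinsics */

/* 128x128 -> 256-bit carry-less multiply assembled from four 64x64 products. */
static void clmul_128x128(__m128i a, __m128i b, __m128i *lo, __m128i *hi) {
    __m128i t00 = _mm_clmulepi64_si128(a, b, 0x00);  /* a_lo * b_lo */
    __m128i t01 = _mm_clmulepi64_si128(a, b, 0x10);  /* a_lo * b_hi */
    __m128i t10 = _mm_clmulepi64_si128(a, b, 0x01);  /* a_hi * b_lo */
    __m128i t11 = _mm_clmulepi64_si128(a, b, 0x11);  /* a_hi * b_hi */
    __m128i mid = _mm_xor_si128(t01, t10);           /* cross terms at weight 2^64 */

    *lo = _mm_xor_si128(t00, _mm_slli_si128(mid, 8));
    *hi = _mm_xor_si128(t11, _mm_srli_si128(mid, 8));
}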

+ 8010
- 0
crypto_kem/ntruhps2048677/avx2/poly_rq_mul.s
File diff suppressed because it is too large
View File


+ 1255
- 0
crypto_kem/ntruhps2048677/avx2/poly_rq_to_s3.s
File diff suppressed because it is too large
View File


+ 569
- 0
crypto_kem/ntruhps2048677/avx2/poly_s3_inv.c View File

@@ -0,0 +1,569 @@
#include "poly.h"

#include <immintrin.h>

typedef signed char small;

#define p 676
#define ppad 768
#define numvec 3

typedef __m256i vec256;

/*
This code stores 768-coeff poly as vec256[3].
Order of 256 coefficients in each vec256
is optimized in light of costs of vector instructions:
0,4,...,252 in 64-bit word;
1,5,...,253 in 64-bit word;
2,6,...,254 in 64-bit word;
3,7,...,255 in 64-bit word.
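I.e., coefficient i of a 256-coefficient block sits in bit (i >> 2) of 64-bit word (i & 3).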
*/

static inline void vec256_frombits(vec256 *v, const small *b) {
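/* pack numvec*256 coefficient bits, given one per byte, into vec256[numvec] in the order above */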
int i;

for (i = 0; i < numvec; ++i) {
vec256 b0 = _mm256_loadu_si256((vec256 *) b);
b += 32; /* 0,1,...,31 */
vec256 b1 = _mm256_loadu_si256((vec256 *) b);
b += 32; /* 32,33,... */
vec256 b2 = _mm256_loadu_si256((vec256 *) b);
b += 32;
vec256 b3 = _mm256_loadu_si256((vec256 *) b);
b += 32;
vec256 b4 = _mm256_loadu_si256((vec256 *) b);
b += 32;
vec256 b5 = _mm256_loadu_si256((vec256 *) b);
b += 32;
vec256 b6 = _mm256_loadu_si256((vec256 *) b);
b += 32;
vec256 b7 = _mm256_loadu_si256((vec256 *) b);
b += 32;

vec256 c0 = _mm256_unpacklo_epi32(b0, b1); /* 0 1 2 3 32 33 34 35 4 5 6 7 36 37 38 39 ... 55 */
vec256 c1 = _mm256_unpackhi_epi32(b0, b1); /* 8 9 10 11 40 41 42 43 ... 63 */
vec256 c2 = _mm256_unpacklo_epi32(b2, b3);
vec256 c3 = _mm256_unpackhi_epi32(b2, b3);
vec256 c4 = _mm256_unpacklo_epi32(b4, b5);
vec256 c5 = _mm256_unpackhi_epi32(b4, b5);
vec256 c6 = _mm256_unpacklo_epi32(b6, b7);
vec256 c7 = _mm256_unpackhi_epi32(b6, b7);

vec256 d0 = c0 | _mm256_slli_epi32(c1, 2); /* 0 8, 1 9, 2 10, 3 11, 32 40, 33 41, ..., 55 63 */
vec256 d2 = c2 | _mm256_slli_epi32(c3, 2);
vec256 d4 = c4 | _mm256_slli_epi32(c5, 2);
vec256 d6 = c6 | _mm256_slli_epi32(c7, 2);

vec256 e0 = _mm256_unpacklo_epi64(d0, d2);
vec256 e2 = _mm256_unpackhi_epi64(d0, d2);
vec256 e4 = _mm256_unpacklo_epi64(d4, d6);
vec256 e6 = _mm256_unpackhi_epi64(d4, d6);

vec256 f0 = e0 | _mm256_slli_epi32(e2, 1);
vec256 f4 = e4 | _mm256_slli_epi32(e6, 1);

vec256 g0 = _mm256_permute2x128_si256(f0, f4, 0x20);
vec256 g4 = _mm256_permute2x128_si256(f0, f4, 0x31);

vec256 h = g0 | _mm256_slli_epi32(g4, 4);

#define TRANSPOSE _mm256_set_epi8( 31,27,23,19, 30,26,22,18, 29,25,21,17, 28,24,20,16, 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0 )
h = _mm256_shuffle_epi8(h, TRANSPOSE);
h = _mm256_permute4x64_epi64(h, 0xd8);
h = _mm256_shuffle_epi32(h, 0xd8);

*v++ = h;
}
}

static inline void vec256_tobits(const vec256 *v, small *b) {
int i;

for (i = 0; i < numvec; ++i) {
vec256 h = *v++;

h = _mm256_shuffle_epi32(h, 0xd8);
h = _mm256_permute4x64_epi64(h, 0xd8);
h = _mm256_shuffle_epi8(h, TRANSPOSE);

vec256 g0 = h & _mm256_set1_epi8(15);
vec256 g4 = _mm256_srli_epi32(h, 4) & _mm256_set1_epi8(15);

vec256 f0 = _mm256_permute2x128_si256(g0, g4, 0x20);
vec256 f4 = _mm256_permute2x128_si256(g0, g4, 0x31);

vec256 e0 = f0 & _mm256_set1_epi8(5);
vec256 e2 = _mm256_srli_epi32(f0, 1) & _mm256_set1_epi8(5);
vec256 e4 = f4 & _mm256_set1_epi8(5);
vec256 e6 = _mm256_srli_epi32(f4, 1) & _mm256_set1_epi8(5);

vec256 d0 = _mm256_unpacklo_epi32(e0, e2);
vec256 d2 = _mm256_unpackhi_epi32(e0, e2);
vec256 d4 = _mm256_unpacklo_epi32(e4, e6);
vec256 d6 = _mm256_unpackhi_epi32(e4, e6);

vec256 c0 = d0 & _mm256_set1_epi8(1);
vec256 c1 = _mm256_srli_epi32(d0, 2) & _mm256_set1_epi8(1);
vec256 c2 = d2 & _mm256_set1_epi8(1);
vec256 c3 = _mm256_srli_epi32(d2, 2) & _mm256_set1_epi8(1);
vec256 c4 = d4 & _mm256_set1_epi8(1);
vec256 c5 = _mm256_srli_epi32(d4, 2) & _mm256_set1_epi8(1);
vec256 c6 = d6 & _mm256_set1_epi8(1);
vec256 c7 = _mm256_srli_epi32(d6, 2) & _mm256_set1_epi8(1);

vec256 b0 = _mm256_unpacklo_epi64(c0, c1);
vec256 b1 = _mm256_unpackhi_epi64(c0, c1);
vec256 b2 = _mm256_unpacklo_epi64(c2, c3);
vec256 b3 = _mm256_unpackhi_epi64(c2, c3);
vec256 b4 = _mm256_unpacklo_epi64(c4, c5);
vec256 b5 = _mm256_unpackhi_epi64(c4, c5);
vec256 b6 = _mm256_unpacklo_epi64(c6, c7);
vec256 b7 = _mm256_unpackhi_epi64(c6, c7);

_mm256_storeu_si256((vec256 *) b, b0);
b += 32;
_mm256_storeu_si256((vec256 *) b, b1);
b += 32;
_mm256_storeu_si256((vec256 *) b, b2);
b += 32;
_mm256_storeu_si256((vec256 *) b, b3);
b += 32;
_mm256_storeu_si256((vec256 *) b, b4);
b += 32;
_mm256_storeu_si256((vec256 *) b, b5);
b += 32;
_mm256_storeu_si256((vec256 *) b, b6);
b += 32;
_mm256_storeu_si256((vec256 *) b, b7);
b += 32;
}
}

static void vec256_init(vec256 *G0, vec256 *G1, const small *s) {
int i;
small srev[ppad + (ppad - p)];
small si;
small g0[ppad];
small g1[ppad];

for (i = 0; i < p; ++i) {
srev[ppad - 1 - i] = s[i];
}
for (i = 0; i < ppad - p; ++i) {
srev[i] = 0;
}
for (i = p; i < ppad; ++i) {
srev[i + ppad - p] = 0;
}

for (i = 0; i < ppad; ++i) {
si = srev[i + ppad - p];
g0[i] = si & 1;
g1[i] = (si >> 1) & g0[i];
}

vec256_frombits(G0, g0);
vec256_frombits(G1, g1);
}

static void vec256_final(small *out, const vec256 *V0, const vec256 *V1) {
int i;
small v0[ppad];
small v1[ppad];
small v[ppad];
small vrev[ppad + (ppad - p)];

vec256_tobits(V0, v0);
vec256_tobits(V1, v1);

for (i = 0; i < ppad; ++i) {
v[i] = v0[i] + 2 * v1[i] - 4 * (v0[i] & v1[i]);
}

for (i = 0; i < ppad; ++i) {
vrev[i] = v[ppad - 1 - i];
}
for (i = ppad; i < ppad + (ppad - p); ++i) {
vrev[i] = 0;
}

for (i = 0; i < p; ++i) {
out[i] = vrev[i + ppad - p];
}
}

static inline int negative_mask(int x) {
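/* arithmetic right shift: all-ones mask if x is negative, 0 otherwise */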
return x >> 31;
}

static inline void vec256_swap(vec256 *f, vec256 *g, int len, vec256 mask) {
vec256 flip;
int i;

for (i = 0; i < len; ++i) {
flip = mask & (f[i] ^ g[i]);
f[i] ^= flip;
g[i] ^= flip;
}
}

static inline void vec256_scale(vec256 *f0, vec256 *f1, const vec256 c0, const vec256 c1) {
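/* f <- c*f, where the constant c is broadcast in (c0, c1) using the same (nonzero, sign) bit-plane encoding */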
int i;

for (i = 0; i < numvec; ++i) {
vec256 f0i = f0[i];
vec256 f1i = f1[i];

f0i &= c0;
f1i ^= c1;
f1i &= f0i;

f0[i] = f0i;
f1[i] = f1i;
}
}

static inline void vec256_eliminate(vec256 *f0, vec256 *f1, vec256 *g0, vec256 *g1, int len, const vec256 c0, const vec256 c1) {
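/* g <- g - c*f over GF(3), coefficient-wise in bit-sliced form; in the main loop c is chosen so that the constant term of g vanishes */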
int i;

for (i = 0; i < len; ++i) {
vec256 f0i = f0[i];
vec256 f1i = f1[i];
vec256 g0i = g0[i];
vec256 g1i = g1[i];
vec256 t;

f0i &= c0;
f1i ^= c1;
f1i &= f0i;

t = g0i ^ f0i;
g0[i] = t | (g1i ^ f1i);
g1[i] = (g1i ^ f0i) & (f1i ^ t);
}
}

static inline int vec256_bit0mask(vec256 *f) {
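/* all-ones mask if the constant-term bit of f is set, 0 otherwise */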
return -(_mm_cvtsi128_si32(_mm256_castsi256_si128(f[0])) & 1);
}

static inline void vec256_divx_1(vec256 *f) {
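/* divide by x: coefficient i lives in bit (i >> 2) of 64-bit word (i & 3),
so shift word 0 down by one bit, then rotate the four words */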
vec256 f0 = f[0];

unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));

low0 = low0 >> 1;

f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);

f[0] = _mm256_permute4x64_epi64(f0, 0x39);
}

static inline void vec256_divx_2(vec256 *f) {
vec256 f0 = f[0];
vec256 f1 = f[1];

unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1));

low0 = (low0 >> 1) | (low1 << 63);
low1 = low1 >> 1;

f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3);

f[0] = _mm256_permute4x64_epi64(f0, 0x39);
f[1] = _mm256_permute4x64_epi64(f1, 0x39);
}

static inline void vec256_divx_3(vec256 *f) {
vec256 f0 = f[0];
vec256 f1 = f[1];
vec256 f2 = f[2];

unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1));
unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2));

low0 = (low0 >> 1) | (low1 << 63);
low1 = (low1 >> 1) | (low2 << 63);
low2 = low2 >> 1;

f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3);
f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3);

f[0] = _mm256_permute4x64_epi64(f0, 0x39);
f[1] = _mm256_permute4x64_epi64(f1, 0x39);
f[2] = _mm256_permute4x64_epi64(f2, 0x39);
}

static inline void vec256_timesx_1(vec256 *f) {
vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93);

unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));

low0 = low0 << 1;

f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);

f[0] = f0;
}

static inline void vec256_timesx_2(vec256 *f) {
vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93);
vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93);

unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1));

low1 = (low1 << 1) | (low0 >> 63);
low0 = low0 << 1;

f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3);

f[0] = f0;
f[1] = f1;
}

static inline void vec256_timesx_3(vec256 *f) {
vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93);
vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93);
vec256 f2 = _mm256_permute4x64_epi64(f[2], 0x93);

unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1));
unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2));

low2 = (low2 << 1) | (low1 >> 63);
low1 = (low1 << 1) | (low0 >> 63);
low0 = low0 << 1;

f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3);
f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3);

f[0] = f0;
f[1] = f1;
f[2] = f2;
}


static int __poly_S3_inv(unsigned char *outbytes, const unsigned char *inbytes) {
small *out = (void *) outbytes;
small *in = (void *) inbytes;
vec256 F0[numvec];
vec256 F1[numvec];
vec256 G0[numvec];
vec256 G1[numvec];
vec256 V0[numvec];
vec256 V1[numvec];
vec256 R0[numvec];
vec256 R1[numvec];
vec256 c0vec, c1vec;
int loop;
int c0, c1;
int minusdelta = -1;
int swapmask;
vec256 swapvec;

vec256_init(G0, G1, in);
F0[0] = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
F0[1] = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
F0[2] = _mm256_set_epi32(511, -1, 511, -1, 511, -1, 1023, -1);
F1[0] = _mm256_set1_epi32(0);
F1[1] = _mm256_set1_epi32(0);
F1[2] = _mm256_set1_epi32(0);

V0[0] = _mm256_set1_epi32(0);
V1[0] = _mm256_set1_epi32(0);
V0[1] = _mm256_set1_epi32(0);
V1[1] = _mm256_set1_epi32(0);
V0[2] = _mm256_set1_epi32(0);
V1[2] = _mm256_set1_epi32(0);

R0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1);
R1[0] = _mm256_set1_epi32(0);
R0[1] = _mm256_set1_epi32(0);
R1[1] = _mm256_set1_epi32(0);
R0[2] = _mm256_set1_epi32(0);
R1[2] = _mm256_set1_epi32(0);
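
/*
256 + 256 + 327 + 256 + 256 = 1351 = 2*p - 1 divstep iterations in total
(one Bernstein-Yang constant-time gcd step per iteration; see the reference
below). The loops are split so that V/R, which grow, and F/G, which shrink,
are only processed over the 256-bit vectors they actually occupy at that stage.
*/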

for (loop = 256; loop > 0; --loop) {
vec256_timesx_1(V0);
vec256_timesx_1(V1);
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);

c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
c1 &= c0;

minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
minusdelta -= 1;

swapvec = _mm256_set1_epi32(swapmask);
vec256_swap(F0, G0, 3, swapvec);
vec256_swap(F1, G1, 3, swapvec);

c0vec = _mm256_set1_epi32(c0);
c1vec = _mm256_set1_epi32(c1);

vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec);
vec256_divx_3(G0);
vec256_divx_3(G1);

vec256_swap(V0, R0, 1, swapvec);
vec256_swap(V1, R1, 1, swapvec);
vec256_eliminate(V0, V1, R0, R1, 1, c0vec, c1vec);
}

for (loop = 256; loop > 0; --loop) {
vec256_timesx_2(V0);
vec256_timesx_2(V1);
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);

c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
c1 &= c0;

minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
minusdelta -= 1;

swapvec = _mm256_set1_epi32(swapmask);
vec256_swap(F0, G0, 3, swapvec);
vec256_swap(F1, G1, 3, swapvec);

c0vec = _mm256_set1_epi32(c0);
c1vec = _mm256_set1_epi32(c1);

vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec);
vec256_divx_3(G0);
vec256_divx_3(G1);

vec256_swap(V0, R0, 2, swapvec);
vec256_swap(V1, R1, 2, swapvec);
vec256_eliminate(V0, V1, R0, R1, 2, c0vec, c1vec);
}

for (loop = 327; loop > 0; --loop) {
vec256_timesx_3(V0);
vec256_timesx_3(V1);
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);

c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
c1 &= c0;

minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
minusdelta -= 1;

swapvec = _mm256_set1_epi32(swapmask);
vec256_swap(F0, G0, 3, swapvec);
vec256_swap(F1, G1, 3, swapvec);

c0vec = _mm256_set1_epi32(c0);
c1vec = _mm256_set1_epi32(c1);

vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec);
vec256_divx_3(G0);
vec256_divx_3(G1);

vec256_swap(V0, R0, 3, swapvec);
vec256_swap(V1, R1, 3, swapvec);
vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec);
}

for (loop = 256; loop > 0; --loop) {
vec256_timesx_3(V0);
vec256_timesx_3(V1);
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);

c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
c1 &= c0;

minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
minusdelta -= 1;

swapvec = _mm256_set1_epi32(swapmask);
vec256_swap(F0, G0, 2, swapvec);
vec256_swap(F1, G1, 2, swapvec);

c0vec = _mm256_set1_epi32(c0);
c1vec = _mm256_set1_epi32(c1);

vec256_eliminate(F0, F1, G0, G1, 2, c0vec, c1vec);
vec256_divx_2(G0);
vec256_divx_2(G1);

vec256_swap(V0, R0, 3, swapvec);
vec256_swap(V1, R1, 3, swapvec);
vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec);
}

for (loop = 256; loop > 0; --loop) {
vec256_timesx_3(V0);
vec256_timesx_3(V1);
swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);

c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
c1 &= c0;

minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
minusdelta -= 1;

swapvec = _mm256_set1_epi32(swapmask);
vec256_swap(F0, G0, 1, swapvec);
vec256_swap(F1, G1, 1, swapvec);

c0vec = _mm256_set1_epi32(c0);
c1vec = _mm256_set1_epi32(c1);

vec256_eliminate(F0, F1, G0, G1, 1, c0vec, c1vec);
vec256_divx_1(G0);
vec256_divx_1(G1);

vec256_swap(V0, R0, 3, swapvec);
vec256_swap(V1, R1, 3, swapvec);
vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec);
}

c0vec = _mm256_set1_epi32(vec256_bit0mask(F0));
c1vec = _mm256_set1_epi32(vec256_bit0mask(F1));
vec256_scale(V0, V1, c0vec, c1vec);

vec256_final(out, V0, V1);
out[p] = negative_mask(minusdelta);
return 0;
}

// This code is based on crypto_core/invhrss701/faster from SUPERCOP. The code was written as a case study
// for the paper "Fast constant-time gcd computation and modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
void PQCLEAN_NTRUHPS2048677_AVX2_poly_S3_inv(poly *r_out, const poly *a) {
const unsigned char *in = (void *) a;
unsigned char *out = (void *) r_out;

small input[ppad];
small output[ppad];
int i;

/* XXX: obviously input/output format should be packed into bytes */

for (i = 0; i < p; ++i) {
small x = in[2 * i] & 3; /* 0 1 2 3 */
x += 1; /* 1 2 3 */
x &= (x - 3) >> 5; /* map 3 -> 0; now 1 2 0, i.e. 0 1 -1 offset by 1 */
input[i] = x - 1;
}
/* XXX: merge with vec256_init */

__poly_S3_inv((unsigned char *)output, (unsigned char *)input);

for (i = 0; i < p; ++i) {
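/* map coefficients {0, 1, -1} back to the poly encoding {0, 1, 2} */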
out[2 * i] = (3 & output[i]) ^ ((3 & output[i]) >> 1);
out[2 * i + 1] = 0;
}
}

+ 46
- 0
crypto_kem/ntruhps2048677/avx2/sample.c View File

@@ -0,0 +1,46 @@
#include "sample.h"

void PQCLEAN_NTRUHPS2048677_AVX2_sample_fg(poly *f, poly *g, const unsigned char uniformbytes[NTRU_SAMPLE_FG_BYTES]) {

PQCLEAN_NTRUHPS2048677_AVX2_sample_iid(f, uniformbytes);
PQCLEAN_NTRUHPS2048677_AVX2_sample_fixed_type(g, uniformbytes + NTRU_SAMPLE_IID_BYTES);
}

void PQCLEAN_NTRUHPS2048677_AVX2_sample_rm(poly *r, poly *m, const unsigned char uniformbytes[NTRU_SAMPLE_RM_BYTES]) {

PQCLEAN_NTRUHPS2048677_AVX2_sample_iid(r, uniformbytes);
PQCLEAN_NTRUHPS2048677_AVX2_sample_fixed_type(m, uniformbytes + NTRU_SAMPLE_IID_BYTES);
}


#include "crypto_sort_int32.h"
void PQCLEAN_NTRUHPS2048677_AVX2_sample_fixed_type(poly *r, const unsigned char u[NTRU_SAMPLE_FT_BYTES]) {
// Assumes NTRU_SAMPLE_FT_BYTES = ceil(30*(n-1)/8)
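// Fixed-weight sampling: bits 2..31 of each word get random data, the low two
// bits carry the eventual coefficient (1 for the first NTRU_WEIGHT/2 words,
// 2 for the next NTRU_WEIGHT/2, 0 elsewhere), and sorting the words then
// places those coefficients in a random order.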

int32_t s[NTRU_N - 1];
int i;

// Use 30 bits of u per word
for (i = 0; i < (NTRU_N - 1) / 4; i++) {
s[4 * i + 0] = (u[15 * i + 0] << 2) + (u[15 * i + 1] << 10) + (u[15 * i + 2] << 18) + ((uint32_t) u[15 * i + 3] << 26);
s[4 * i + 1] = ((u[15 * i + 3] & 0xc0) >> 4) + (u[15 * i + 4] << 4) + (u[15 * i + 5] << 12) + (u[15 * i + 6] << 20) + ((uint32_t) u[15 * i + 7] << 28);
s[4 * i + 2] = ((u[15 * i + 7] & 0xf0) >> 2) + (u[15 * i + 8] << 6) + (u[15 * i + 9] << 14) + (u[15 * i + 10] << 22) + ((uint32_t) u[15 * i + 11] << 30);
s[4 * i + 3] = (u[15 * i + 11] & 0xfc) + (u[15 * i + 12] << 8) + (u[15 * i + 13] << 16) + ((uint32_t) u[15 * i + 14] << 24);
}

for (i = 0; i < NTRU_WEIGHT / 2; i++) {
s[i] |= 1;
}

for (i = NTRU_WEIGHT / 2; i < NTRU_WEIGHT; i++) {
s[i] |= 2;
}

PQCLEAN_NTRUHPS2048677_AVX2_crypto_sort_int32(s, NTRU_N - 1);

for (i = 0; i < NTRU_N - 1; i++) {
r->coeffs[i] = ((uint16_t) (s[i] & 3));
}

r->coeffs[NTRU_N - 1] = 0;
}

+ 15
- 0
crypto_kem/ntruhps2048677/avx2/sample.h View File

@@ -0,0 +1,15 @@
#ifndef SAMPLE_H
#define SAMPLE_H

#include "params.h"
#include "poly.h"

void PQCLEAN_NTRUHPS2048677_AVX2_sample_fg(poly *f, poly *g, const unsigned char uniformbytes[NTRU_SAMPLE_FG_BYTES]);
void PQCLEAN_NTRUHPS2048677_AVX2_sample_rm(poly *r, poly *m, const unsigned char uniformbytes[NTRU_SAMPLE_RM_BYTES]);

void PQCLEAN_NTRUHPS2048677_AVX2_sample_iid(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_IID_BYTES]);

void PQCLEAN_NTRUHPS2048677_AVX2_sample_fixed_type(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_FT_BYTES]);


#endif

+ 21
- 0
crypto_kem/ntruhps2048677/avx2/sample_iid.c View File

@@ -0,0 +1,21 @@
#include <immintrin.h>

#include "sample.h"

extern void PQCLEAN_NTRUHPS2048677_AVX2_vec32_sample_iid(poly *r, const unsigned char uniformbytes[PAD32(NTRU_SAMPLE_IID_BYTES)]);

void PQCLEAN_NTRUHPS2048677_AVX2_sample_iid(poly *r, const unsigned char uniformbytes[NTRU_SAMPLE_IID_BYTES]) {
int i;
union { /* align to 32 byte boundary for vmovdqa */
unsigned char b[PAD32(NTRU_SAMPLE_IID_BYTES)];
__m256i b_x32[PAD32(NTRU_SAMPLE_IID_BYTES) / 32];
} buffer;

for (i = 0; i < NTRU_SAMPLE_IID_BYTES; i++) {
buffer.b[i] = uniformbytes[i];
}
for (i = NTRU_SAMPLE_IID_BYTES; i < PAD32(NTRU_SAMPLE_IID_BYTES); i++) {
buffer.b[i] = 0;
}
PQCLEAN_NTRUHPS2048677_AVX2_vec32_sample_iid(r, buffer.b);
}

+ 7189
- 0
crypto_kem/ntruhps2048677/avx2/square_10_677_shufbytes.s
File diff suppressed because it is too large
View File


+ 7312
- 0
crypto_kem/ntruhps2048677/avx2/square_168_677_shufbytes.s
File diff suppressed because it is too large
View File


+ 134
- 0
crypto_kem/ntruhps2048677/avx2/square_1_677_patience.s View File

@@ -0,0 +1,134 @@
.data
.p2align 5
.text
.global PQCLEAN_NTRUHPS2048677_AVX2_square_1_677
.global _PQCLEAN_NTRUHPS2048677_AVX2_square_1_677
PQCLEAN_NTRUHPS2048677_AVX2_square_1_677:
_PQCLEAN_NTRUHPS2048677_AVX2_square_1_677:
push %r15
push %r14
push %r13
push %r12
push %rbx
push %rbp
mov 0(%rsi), %r11
mov %r11, %r10
and $-0x1, %r10
mov $0x5555555555555555, %rbp
pdep %rbp, %r10, %r10
mov %r10, 0(%rdi)
mov $0xffffffff00000000, %rbx
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 8(%rdi)
mov 8(%rsi), %r11
mov %r11, %r10
and $-0x1, %r10
pdep %rbp, %r10, %r10
mov %r10, 16(%rdi)
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 24(%rdi)
mov 16(%rsi), %r11
mov %r11, %r10
and $-0x1, %r10
pdep %rbp, %r10, %r10
mov %r10, 32(%rdi)
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 40(%rdi)
mov 24(%rsi), %r11
mov %r11, %r10
and $-0x1, %r10
pdep %rbp, %r10, %r10
mov %r10, 48(%rdi)
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 56(%rdi)
mov 32(%rsi), %r11
mov %r11, %r10
and $-0x1, %r10
pdep %rbp, %r10, %r10
mov %r10, 64(%rdi)
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 72(%rdi)
mov 40(%rsi), %r11
mov $0x7fffffff80000, %r12
pext %r12, %r11, %r10
mov $0xaaaaaaaaaaaaaaaa, %r13
pdep %r13, %r10, %r10
xor %r10, 0(%rdi)
mov $0xfff8000000000000, %r14
pext %r14, %r11, %r10
mov $0x2aaaaaa, %r15
pdep %r15, %r10, %r10
xor %r10, 8(%rdi)
mov %r11, %r10
and $0x7ffff, %r10
mov $0x1555555555, %r9
pdep %r9, %r10, %r10
mov %r10, 80(%rdi)
mov 48(%rsi), %r11
mov %r11, %r10
and $0x7ffff, %r10
mov $0xaaaaaaaaa8000000, %r8
pdep %r8, %r10, %r10
xor %r10, 8(%rdi)
pext %r12, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 16(%rdi)
pext %r14, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 24(%rdi)
mov 56(%rsi), %r11
mov %r11, %r10
and $0x7ffff, %r10
pdep %r8, %r10, %r10
xor %r10, 24(%rdi)
pext %r12, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 32(%rdi)
pext %r14, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 40(%rdi)
mov 64(%rsi), %r11
mov %r11, %r10
and $0x7ffff, %r10
pdep %r8, %r10, %r10
xor %r10, 40(%rdi)
pext %r12, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 48(%rdi)
pext %r14, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 56(%rdi)
mov 72(%rsi), %r11
mov %r11, %r10
and $0x7ffff, %r10
pdep %r8, %r10, %r10
xor %r10, 56(%rdi)
pext %r12, %r11, %r10
pdep %r13, %r10, %r10
xor %r10, 64(%rdi)
pext %r14, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 72(%rdi)
mov 80(%rsi), %r11
mov %r11, %r10
and $0x7ffff, %r10
pdep %r8, %r10, %r10
xor %r10, 72(%rdi)
mov $0x1ffff80000, %rdx
pext %rdx, %r11, %r10
mov $0xaaaaaaaaa, %rcx
pdep %rcx, %r10, %r10
xor %r10, 80(%rdi)
movq $0x0, 88(%rdi)
pop %rbp
pop %rbx
pop %r12
pop %r13
pop %r14
pop %r15
ret

+ 6580
- 0
crypto_kem/ntruhps2048677/avx2/square_21_677_shufbytes.s
File diff suppressed because it is too large
View File


+ 235
- 0
crypto_kem/ntruhps2048677/avx2/square_2_677_patience.s View File

@@ -0,0 +1,235 @@
.data
.p2align 5
.text
.global PQCLEAN_NTRUHPS2048677_AVX2_square_2_677
.global _PQCLEAN_NTRUHPS2048677_AVX2_square_2_677
PQCLEAN_NTRUHPS2048677_AVX2_square_2_677:
_PQCLEAN_NTRUHPS2048677_AVX2_square_2_677:
push %r15
push %r14
push %r13
push %r12
push %rbx
push %rbp
mov 0(%rsi), %r11
mov %r11, %r10
and $0xffff, %r10
mov $0x1111111111111111, %rbp
pdep %rbp, %r10, %r10
mov %r10, 0(%rdi)
mov $0xffff0000, %rbx
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 8(%rdi)
mov $0xffff00000000, %r12
pext %r12, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 16(%rdi)
mov $0xffff000000000000, %r13
pext %r13, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 24(%rdi)
mov 8(%rsi), %r11
mov %r11, %r10
and $0xffff, %r10
pdep %rbp, %r10, %r10
mov %r10, 32(%rdi)
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 40(%rdi)
pext %r12, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 48(%rdi)
pext %r13, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 56(%rdi)
mov 16(%rsi), %r11
mov $0x3fffc0000000000, %r14
pext %r14, %r11, %r10
mov $0x8888888888888888, %r15
pdep %r15, %r10, %r10
xor %r10, 0(%rdi)
mov $0xfc00000000000000, %r9
pext %r9, %r11, %r10
mov $0x888888, %r8
pdep %r8, %r10, %r10
xor %r10, 8(%rdi)
mov %r11, %r10
and $0xffff, %r10
pdep %rbp, %r10, %r10
mov %r10, 64(%rdi)
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 72(%rdi)
mov $0x3ff00000000, %rdx
pext %rdx, %r11, %r10
mov $0x1111111111, %rcx
pdep %rcx, %r10, %r10
mov %r10, 80(%rdi)
mov 24(%rsi), %r11
mov %r11, %r10
and $0x3ff, %r10
mov $0x8888888888000000, %rax
pdep %rax, %r10, %r10
xor %r10, 8(%rdi)
mov $0x3fffc00, %r12
pext %r12, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 16(%rdi)
mov $0x3fffc000000, %r13
pext %r13, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 24(%rdi)
pext %r14, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 32(%rdi)
pext %r9, %r11, %r10
pdep %r8, %r10, %r10
xor %r10, 40(%rdi)
mov 32(%rsi), %r11
mov %r11, %r10
and $0x3ff, %r10
pdep %rax, %r10, %r10
xor %r10, 40(%rdi)
pext %r12, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 48(%rdi)
pext %r13, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 56(%rdi)
pext %r14, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 64(%rdi)
pext %r9, %r11, %r10
pdep %r8, %r10, %r10
xor %r10, 72(%rdi)
mov 40(%rsi), %r11
mov $0x7fff80000, %rbx
pext %rbx, %r11, %r10
mov $0x4444444444444444, %rbp
pdep %rbp, %r10, %r10
xor %r10, 0(%rdi)
mov $0x7fff800000000, %rdx
pext %rdx, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 8(%rdi)
mov $0xfff8000000000000, %rcx
pext %rcx, %r11, %r10
mov $0x4444444444444, %rax
pdep %rax, %r10, %r10
xor %r10, 16(%rdi)
mov %r11, %r10
and $0x3ff, %r10
mov $0x8888888888000000, %r12
pdep %r12, %r10, %r10
xor %r10, 72(%rdi)
mov $0x7fc00, %r13
pext %r13, %r11, %r10
mov $0x888888888, %r14
pdep %r14, %r10, %r10
xor %r10, 80(%rdi)
mov 48(%rsi), %r11
mov %r11, %r10
and $0x7, %r10
mov $0x4440000000000000, %r15
pdep %r15, %r10, %r10
xor %r10, 16(%rdi)
mov $0x7fff8, %r9
pext %r9, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 24(%rdi)
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 32(%rdi)
pext %rdx, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 40(%rdi)
pext %rcx, %r11, %r10
pdep %rax, %r10, %r10
xor %r10, 48(%rdi)
mov 56(%rsi), %r11
mov $0xf000000000000000, %r8
pext %r8, %r11, %r10
mov $0x2222, %r12
pdep %r12, %r10, %r10
xor %r10, 0(%rdi)
mov %r11, %r10
and $0x7, %r10
pdep %r15, %r10, %r10
xor %r10, 48(%rdi)
pext %r9, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 56(%rdi)
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 64(%rdi)
pext %rdx, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 72(%rdi)
mov $0xff8000000000000, %r13
pext %r13, %r11, %r10
mov $0x444444444, %r14
pdep %r14, %r10, %r10
xor %r10, 80(%rdi)
mov 64(%rsi), %r11
mov %r11, %r10
and $0xfff, %r10
mov $0x2222222222220000, %rcx
pdep %rcx, %r10, %r10
xor %r10, 0(%rdi)
mov $0xffff000, %rax
pext %rax, %r11, %r10
mov $0x2222222222222222, %r8
pdep %r8, %r10, %r10
xor %r10, 8(%rdi)
mov $0xffff0000000, %r12
pext %r12, %r11, %r10
pdep %r8, %r10, %r10
xor %r10, 16(%rdi)
mov $0xffff00000000000, %r15
pext %r15, %r11, %r10
pdep %r8, %r10, %r10
xor %r10, 24(%rdi)
mov $0xf000000000000000, %r9
pext %r9, %r11, %r10
mov $0x2222, %rbx
pdep %rbx, %r10, %r10
xor %r10, 32(%rdi)
mov 72(%rsi), %r11
mov %r11, %r10
and $0xfff, %r10
pdep %rcx, %r10, %r10
xor %r10, 32(%rdi)
pext %rax, %r11, %r10
pdep %r8, %r10, %r10
xor %r10, 40(%rdi)
pext %r12, %r11, %r10
pdep %r8, %r10, %r10
xor %r10, 48(%rdi)
pext %r15, %r11, %r10
pdep %r8, %r10, %r10
xor %r10, 56(%rdi)
pext %r9, %r11, %r10
pdep %rbx, %r10, %r10
xor %r10, 64(%rdi)
mov 80(%rsi), %r11
mov %r11, %r10
and $0xfff, %r10
pdep %rcx, %r10, %r10
xor %r10, 64(%rdi)
pext %rax, %r11, %r10
pdep %r8, %r10, %r10
xor %r10, 72(%rdi)
mov $0x1ff0000000, %rdx
pext %rdx, %r11, %r10
mov $0x222222222, %rbp
pdep %rbp, %r10, %r10
xor %r10, 80(%rdi)
movq $0x0, 88(%rdi)
pop %rbp
pop %rbx
pop %r12
pop %r13
pop %r14
pop %r15
ret

+ 6450
- 0
crypto_kem/ntruhps2048677/avx2/square_336_677_shufbytes.s
File diff suppressed because it is too large
View File


+ 452
- 0
crypto_kem/ntruhps2048677/avx2/square_3_677_patience.s View File

@@ -0,0 +1,452 @@
.data
.p2align 5
.text
.global PQCLEAN_NTRUHPS2048677_AVX2_square_3_677
.global _PQCLEAN_NTRUHPS2048677_AVX2_square_3_677
PQCLEAN_NTRUHPS2048677_AVX2_square_3_677:
_PQCLEAN_NTRUHPS2048677_AVX2_square_3_677:
push %r15
push %r14
push %r13
push %r12
push %rbx
push %rbp
mov 0(%rsi), %r11
mov %r11, %r10
and $0xff, %r10
mov $0x101010101010101, %rbp
pdep %rbp, %r10, %r10
mov %r10, 0(%rdi)
mov $0xff00, %rbx
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 8(%rdi)
mov $0xff0000, %r12
pext %r12, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 16(%rdi)
mov $0xff000000, %r13
pext %r13, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 24(%rdi)
mov $0xff00000000, %r14
pext %r14, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 32(%rdi)
mov $0xff0000000000, %r15
pext %r15, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 40(%rdi)
mov $0xff000000000000, %r9
pext %r9, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 48(%rdi)
mov $0xff00000000000000, %r8
pext %r8, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 56(%rdi)
mov 8(%rsi), %r11
mov $0x1fe00000, %rdx
pext %rdx, %r11, %r10
mov $0x808080808080808, %rcx
pdep %rcx, %r10, %r10
xor %r10, 0(%rdi)
mov $0x1fe0000000, %rax
pext %rax, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 8(%rdi)
mov $0x1fe000000000, %rbx
pext %rbx, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 16(%rdi)
mov $0x1fe00000000000, %r12
pext %r12, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 24(%rdi)
mov $0x1fe0000000000000, %r13
pext %r13, %r11, %r10
pdep %rcx, %r10, %r10
xor %r10, 32(%rdi)
mov $0xe000000000000000, %r14
pext %r14, %r11, %r10
mov $0x80808, %r15
pdep %r15, %r10, %r10
xor %r10, 40(%rdi)
mov %r11, %r10
and $0xff, %r10
pdep %rbp, %r10, %r10
mov %r10, 64(%rdi)
mov $0xff00, %r9
pext %r9, %r11, %r10
pdep %rbp, %r10, %r10
mov %r10, 72(%rdi)
mov $0x1f0000, %r8
pext %r8, %r11, %r10
mov $0x101010101, %rdx
pdep %rdx, %r10, %r10
mov %r10, 80(%rdi)
mov 16(%rsi), %r11
mov $0x3fc0000000000, %rax
pext %rax, %r11, %r10
mov $0x4040404040404040, %rbx
pdep %rbx, %r10, %r10
xor %r10, 0(%rdi)
mov $0x3fc000000000000, %r12
pext %r12, %r11, %r10
pdep %rbx, %r10, %r10
xor %r10, 8(%rdi)
mov $0xfc00000000000000, %r13
pext %r13, %r11, %r10
mov $0x404040404040, %rcx
pdep %rcx, %r10, %r10
xor %r10, 16(%rdi)
mov %r11, %r10
and $0x1f, %r10
mov $0x808080808000000, %r14
pdep %r14, %r10, %r10
xor %r10, 40(%rdi)
mov $0x1fe0, %r15
pext %r15, %r11, %r10
mov $0x808080808080808, %r9
pdep %r9, %r10, %r10
xor %r10, 48(%rdi)
mov $0x1fe000, %rbp
pext %rbp, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 56(%rdi)
mov $0x1fe00000, %r8
pext %r8, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 64(%rdi)
mov $0x1fe0000000, %rdx
pext %rdx, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 72(%rdi)
mov $0x3e000000000, %rax
pext %rax, %r11, %r10
mov $0x808080808, %r12
pdep %r12, %r10, %r10
xor %r10, 80(%rdi)
mov 24(%rsi), %r11
mov $0xc000000000000000, %rbx
pext %rbx, %r11, %r10
mov $0x202, %r13
pdep %r13, %r10, %r10
xor %r10, 0(%rdi)
mov %r11, %r10
and $0x3, %r10
mov $0x4040000000000000, %rcx
pdep %rcx, %r10, %r10
xor %r10, 16(%rdi)
mov $0x3fc, %r14
pext %r14, %r11, %r10
mov $0x4040404040404040, %r15
pdep %r15, %r10, %r10
xor %r10, 24(%rdi)
mov $0x3fc00, %rbp
pext %rbp, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 32(%rdi)
mov $0x3fc0000, %r8
pext %r8, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 40(%rdi)
mov $0x3fc000000, %rdx
pext %rdx, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 48(%rdi)
mov $0x3fc00000000, %r9
pext %r9, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 56(%rdi)
mov $0x3fc0000000000, %rax
pext %rax, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 64(%rdi)
mov $0x3fc000000000000, %r12
pext %r12, %r11, %r10
pdep %r15, %r10, %r10
xor %r10, 72(%rdi)
mov $0x3c00000000000000, %rbx
pext %rbx, %r11, %r10
mov $0x40404040, %r13
pdep %r13, %r10, %r10
xor %r10, 80(%rdi)
mov 32(%rsi), %r11
mov %r11, %r10
and $0x3f, %r10
mov $0x202020202020000, %rcx
pdep %rcx, %r10, %r10
xor %r10, 0(%rdi)
mov $0x3fc0, %r14
pext %r14, %r11, %r10
mov $0x202020202020202, %rbp
pdep %rbp, %r10, %r10
xor %r10, 8(%rdi)
mov $0x3fc000, %r8
pext %r8, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 16(%rdi)
mov $0x3fc00000, %rdx
pext %rdx, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 24(%rdi)
mov $0x3fc0000000, %r9
pext %r9, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 32(%rdi)
mov $0x3fc000000000, %rax
pext %rax, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 40(%rdi)
mov $0x3fc00000000000, %r12
pext %r12, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 48(%rdi)
mov $0x3fc0000000000000, %r15
pext %r15, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 56(%rdi)
mov $0xc000000000000000, %rbx
pext %rbx, %r11, %r10
mov $0x202, %r13
pdep %r13, %r10, %r10
xor %r10, 64(%rdi)
mov 40(%rsi), %r11
mov $0x7f80000, %rcx
pext %rcx, %r11, %r10
mov $0x1010101010101010, %r14
pdep %r14, %r10, %r10
xor %r10, 0(%rdi)
mov $0x7f8000000, %r8
pext %r8, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 8(%rdi)
mov $0x7f800000000, %rdx
pext %rdx, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 16(%rdi)
mov $0x7f80000000000, %r9
pext %r9, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 24(%rdi)
mov $0x7f8000000000000, %rax
pext %rax, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 32(%rdi)
mov $0xf800000000000000, %r12
pext %r12, %r11, %r10
mov $0x1010101010, %r15
pdep %r15, %r10, %r10
xor %r10, 40(%rdi)
mov %r11, %r10
and $0x3f, %r10
mov $0x202020202020000, %rbp
pdep %rbp, %r10, %r10
xor %r10, 64(%rdi)
mov $0x3fc0, %rbx
pext %rbx, %r11, %r10
mov $0x202020202020202, %r13
pdep %r13, %r10, %r10
xor %r10, 72(%rdi)
mov $0x7c000, %rcx
pext %rcx, %r11, %r10
mov $0x202020202, %r8
pdep %r8, %r10, %r10
xor %r10, 80(%rdi)
mov 48(%rsi), %r11
mov $0xff0000000000, %rdx
pext %rdx, %r11, %r10
mov $0x8080808080808080, %r9
pdep %r9, %r10, %r10
xor %r10, 0(%rdi)
mov $0xff000000000000, %rax
pext %rax, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 8(%rdi)
mov $0xff00000000000000, %r14
pext %r14, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 16(%rdi)
mov %r11, %r10
and $0x7, %r10
mov $0x1010100000000000, %r12
pdep %r12, %r10, %r10
xor %r10, 40(%rdi)
mov $0x7f8, %r15
pext %r15, %r11, %r10
mov $0x1010101010101010, %rbp
pdep %rbp, %r10, %r10
xor %r10, 48(%rdi)
mov $0x7f800, %rbx
pext %rbx, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 56(%rdi)
mov $0x7f80000, %r13
pext %r13, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 64(%rdi)
mov $0x7f8000000, %rcx
pext %rcx, %r11, %r10
pdep %rbp, %r10, %r10
xor %r10, 72(%rdi)
mov $0xf800000000, %r8
pext %r8, %r11, %r10
mov $0x1010101010, %rdx
pdep %rdx, %r10, %r10
xor %r10, 80(%rdi)
mov 56(%rsi), %r11
mov $0xf000000000000000, %rax
pext %rax, %r11, %r10
mov $0x4040404, %r14
pdep %r14, %r10, %r10
xor %r10, 0(%rdi)
mov %r11, %r10
and $0xff, %r10
pdep %r9, %r10, %r10
xor %r10, 24(%rdi)
mov $0xff00, %r12
pext %r12, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 32(%rdi)
mov $0xff0000, %r15
pext %r15, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 40(%rdi)
mov $0xff000000, %rbx
pext %rbx, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 48(%rdi)
mov $0xff00000000, %r13
pext %r13, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 56(%rdi)
mov $0xff0000000000, %rcx
pext %rcx, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 64(%rdi)
mov $0xff000000000000, %rbp
pext %rbp, %r11, %r10
pdep %r9, %r10, %r10
xor %r10, 72(%rdi)
mov $0xf00000000000000, %r8
pext %r8, %r11, %r10
mov $0x80808080, %rdx
pdep %rdx, %r10, %r10
xor %r10, 80(%rdi)
mov 64(%rsi), %r11
mov %r11, %r10
and $0xf, %r10
mov $0x404040400000000, %rax
pdep %rax, %r10, %r10
xor %r10, 0(%rdi)
mov $0xff0, %r14
pext %r14, %r11, %r10
mov $0x404040404040404, %r12
pdep %r12, %r10, %r10
xor %r10, 8(%rdi)
mov $0xff000, %r15
pext %r15, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 16(%rdi)
mov $0xff00000, %rbx
pext %rbx, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 24(%rdi)
mov $0xff0000000, %r13
pext %r13, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 32(%rdi)
mov $0xff000000000, %rcx
pext %rcx, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 40(%rdi)
mov $0xff00000000000, %rbp
pext %rbp, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 48(%rdi)
mov $0xff0000000000000, %r9
pext %r9, %r11, %r10
pdep %r12, %r10, %r10
xor %r10, 56(%rdi)
mov $0xf000000000000000, %r8
pext %r8, %r11, %r10
mov $0x4040404, %rdx
pdep %rdx, %r10, %r10
xor %r10, 64(%rdi)
mov 72(%rsi), %r11
mov $0x1fe0000, %rax
pext %rax, %r11, %r10
mov $0x2020202020202020, %r14
pdep %r14, %r10, %r10
xor %r10, 0(%rdi)
mov $0x1fe000000, %r15
pext %r15, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 8(%rdi)
mov $0x1fe00000000, %rbx
pext %rbx, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 16(%rdi)
mov $0x1fe0000000000, %r13
pext %r13, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 24(%rdi)
mov $0x1fe000000000000, %rcx
pext %rcx, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 32(%rdi)
mov $0xfe00000000000000, %rbp
pext %rbp, %r11, %r10
mov $0x20202020202020, %r9
pdep %r9, %r10, %r10
xor %r10, 40(%rdi)
mov %r11, %r10
and $0xf, %r10
mov $0x404040400000000, %r12
pdep %r12, %r10, %r10
xor %r10, 64(%rdi)
mov $0xff0, %r8
pext %r8, %r11, %r10
mov $0x404040404040404, %rdx
pdep %rdx, %r10, %r10
xor %r10, 72(%rdi)
mov $0x1f000, %rax
pext %rax, %r11, %r10
mov $0x404040404, %r15
pdep %r15, %r10, %r10
xor %r10, 80(%rdi)
mov 80(%rsi), %r11
mov %r11, %r10
and $0x1, %r10
rol $61, %r10
xor %r10, 40(%rdi)
mov $0x1fe, %rbx
pext %rbx, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 48(%rdi)
mov $0x1fe00, %r13
pext %r13, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 56(%rdi)
mov $0x1fe0000, %rcx
pext %rcx, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 64(%rdi)
mov $0x1fe000000, %rbp
pext %rbp, %r11, %r10
pdep %r14, %r10, %r10
xor %r10, 72(%rdi)
mov $0x1e00000000, %r9
pext %r9, %r11, %r10
mov $0x20202020, %r12
pdep %r12, %r10, %r10
xor %r10, 80(%rdi)
movq $0x0, 88(%rdi)
pop %rbp
pop %rbx
pop %r12
pop %r13
pop %r14
pop %r15
ret

+ 8477
- 0
crypto_kem/ntruhps2048677/avx2/square_42_677_shufbytes.s
File diff suppressed because it is too large
View File


+ 1478
- 0
crypto_kem/ntruhps2048677/avx2/square_5_677_patience.s
File diff suppressed because it is too large
View File


+ 6940
- 0
crypto_kem/ntruhps2048677/avx2/square_84_677_shufbytes.s
File diff suppressed because it is too large
View File


+ 1066
- 0
crypto_kem/ntruhps2048677/avx2/vec32_sample_iid.s
File diff suppressed because it is too large
View File


+ 3
- 3
crypto_kem/ntruhps2048677/clean/Makefile View File

@@ -1,10 +1,10 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libntruhps2048677_clean.a
-HEADERS=api.h crypto_sort.h owcpa.h params.h poly.h sample.h verify.h
-OBJECTS=crypto_sort.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_mod.o poly_r2_inv.o poly_rq_mul.o poly_s3_inv.o sample.o sample_iid.o verify.o
+HEADERS=api.h cmov.h crypto_sort_int32.h owcpa.h params.h poly.h sample.h
+OBJECTS=cmov.o crypto_sort_int32.o kem.o owcpa.o pack3.o packq.o poly.o poly_lift.o poly_mod.o poly_r2_inv.o poly_rq_mul.o poly_s3_inv.o sample.o sample_iid.o

-CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)
+CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)

all: $(LIB)



+ 1
- 1
crypto_kem/ntruhps2048677/clean/Makefile.Microsoft_nmake View File

@@ -2,7 +2,7 @@
# nmake /f Makefile.Microsoft_nmake

LIBRARY=libntruhps2048677_clean.lib
-OBJECTS=crypto_sort.obj kem.obj owcpa.obj pack3.obj packq.obj poly.obj poly_lift.obj poly_mod.obj poly_r2_inv.obj poly_rq_mul.obj poly_s3_inv.obj sample.obj sample_iid.obj verify.obj
+OBJECTS=cmov.obj crypto_sort_int32.obj kem.obj owcpa.obj pack3.obj packq.obj poly.obj poly_lift.obj poly_mod.obj poly_r2_inv.obj poly_rq_mul.obj poly_s3_inv.obj sample.obj sample_iid.obj

CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX



Some files were not shown because too many files changed in this diff
