Update NTRU

Oussama Danba caught a bug in how we were handling a test related to ciphertext encoding. Not a security issue as far as we can tell, but certainly not the correct behavior. More detail here: e0ab9525f1. This commit also switches ntruhrss701/avx2/poly_S3_inv to the Bernstein--Yang code and slightly simplifies the poly_Rq_to_S3 routine.
2020-10-09 15:13:48 -04:00 · 2020-10-09 15:13:48 -04:00 · a94abf08a1
commit a94abf08a1
parent 462c7971d8
23 changed files with 4556 additions and 9753 deletions
--- a/crypto_kem/ntruhps2048509/META.yml
+++ b/crypto_kem/ntruhps2048509/META.yml
@ -23,9 +23,9 @@ auxiliary-submitters:
  - Zhenfei Zhang
 implementations:
    - name: clean
-      version: https://github.com/jschanck/ntru/tree/2d4df948 reference implementation
+      version: https://github.com/jschanck/ntru/tree/b38a346a reference implementation
    - name: avx2
-      version: https://github.com/jschanck/ntru/tree/2d4df948 avx2 implementation
+      version: https://github.com/jschanck/ntru/tree/b38a346a avx2 implementation
      supported_platforms:
          - architecture: x86_64
            operating_systems:
--- a/crypto_kem/ntruhps2048509/avx2/owcpa.c
+++ b/crypto_kem/ntruhps2048509/avx2/owcpa.c
@ -2,40 +2,59 @@
 #include "poly.h"
 #include "sample.h"

-static int owcpa_check_r(const poly *r) {
-    /* Check that r is in message space. */
-    /* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */
-    int i;
-    uint64_t t = 0;
-    uint16_t c;
-    for (i = 0; i < NTRU_N; i++) {
-        c = MODQ(r->coeffs[i] + 1);
-        t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */
-        t |= (c + 1) & 0x4;   /* 0 if c is in {0,1,2} */
+static int owcpa_check_ciphertext(const unsigned char *ciphertext) {
+    /* A ciphertext is log2(q)*(n-1) bits packed into bytes.  */
+    /* Check that any unused bits of the final byte are zero. */
+
+    uint16_t t = 0;
+
+    t = ciphertext[NTRU_CIPHERTEXTBYTES - 1];
+    t &= 0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG)));
+
+    /* We have 0 <= t < 256 */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 15));
 }
-    t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+
+static int owcpa_check_r(const poly *r) {
+    /* A valid r has coefficients in {0,1,q-1} and has r[N-1] = 0 */
+    /* Note: We may assume that 0 <= r[i] <= q-1 for all i        */
+
+    int i;
+    uint32_t t = 0;
+    uint16_t c;
+    for (i = 0; i < NTRU_N - 1; i++) {
+        c = r->coeffs[i];
+        t |= (c + 1) & (NTRU_Q - 4); /* 0 iff c is in {-1,0,1,2} */
+        t |= (c + 2) & 4;  /* 1 if c = 2, 0 if c is in {-1,0,1} */
+    }
+    t |= r->coeffs[NTRU_N - 1]; /* Coefficient n-1 must be zero */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 static int owcpa_check_m(const poly *m) {
-    /* Check that m is in message space. */
-    /* Note: Assumes that m has coefficients in {0,1,2}. */
+    /* Check that m is in message space, i.e.                  */
+    /*  (1)  |{i : m[i] = 1}| = |{i : m[i] = 2}|, and          */
+    /*  (2)  |{i : m[i] != 0}| = NTRU_WEIGHT.                  */
+    /* Note: We may assume that m has coefficients in {0,1,2}. */
+
    int i;
-    uint64_t t = 0;
-    uint16_t p1 = 0;
-    uint16_t m1 = 0;
+    uint32_t t = 0;
+    uint16_t ps = 0;
+    uint16_t ms = 0;
    for (i = 0; i < NTRU_N; i++) {
-        p1 += m->coeffs[i] & 0x01;
-        m1 += (m->coeffs[i] & 0x02) >> 1;
+        ps += m->coeffs[i] & 1;
+        ms += m->coeffs[i] & 2;
    }
-    /* Need p1 = m1 and p1 + m1 = NTRU_WEIGHT */
-    t |= p1 ^ m1;
-    t |= (p1 + m1) ^ NTRU_WEIGHT;
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+    t |= ps ^ (ms >> 1);   /* 0 if (1) holds */
+    t |= ms ^ NTRU_WEIGHT; /* 0 if (1) and (2) hold */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 void PQCLEAN_NTRUHPS2048509_AVX2_owcpa_keypair(unsigned char *pk,
@ -125,8 +144,8 @@ int PQCLEAN_NTRUHPS2048509_AVX2_owcpa_dec(unsigned char *rm,

    fail = 0;

-    /* Check that unused bits of last byte of ciphertext are zero */
-    fail |= ciphertext[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG))));
+    /* Check that the unused bits of the last byte of the ciphertext are zero */
+    fail |= owcpa_check_ciphertext(ciphertext);

    /* For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)).             */
    /* We can avoid re-computing r*h + Lift(m) as long as we check that        */
--- a/crypto_kem/ntruhps2048509/avx2/poly_rq_to_s3.s
+++ b/crypto_kem/ntruhps2048509/avx2/poly_rq_to_s3.s
--- a/crypto_kem/ntruhps2048509/clean/owcpa.c
+++ b/crypto_kem/ntruhps2048509/clean/owcpa.c
@ -2,40 +2,59 @@
 #include "poly.h"
 #include "sample.h"

-static int owcpa_check_r(const poly *r) {
-    /* Check that r is in message space. */
-    /* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */
-    int i;
-    uint64_t t = 0;
-    uint16_t c;
-    for (i = 0; i < NTRU_N; i++) {
-        c = MODQ(r->coeffs[i] + 1);
-        t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */
-        t |= (c + 1) & 0x4;   /* 0 if c is in {0,1,2} */
+static int owcpa_check_ciphertext(const unsigned char *ciphertext) {
+    /* A ciphertext is log2(q)*(n-1) bits packed into bytes.  */
+    /* Check that any unused bits of the final byte are zero. */
+
+    uint16_t t = 0;
+
+    t = ciphertext[NTRU_CIPHERTEXTBYTES - 1];
+    t &= 0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG)));
+
+    /* We have 0 <= t < 256 */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 15));
 }
-    t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+
+static int owcpa_check_r(const poly *r) {
+    /* A valid r has coefficients in {0,1,q-1} and has r[N-1] = 0 */
+    /* Note: We may assume that 0 <= r[i] <= q-1 for all i        */
+
+    int i;
+    uint32_t t = 0;
+    uint16_t c;
+    for (i = 0; i < NTRU_N - 1; i++) {
+        c = r->coeffs[i];
+        t |= (c + 1) & (NTRU_Q - 4); /* 0 iff c is in {-1,0,1,2} */
+        t |= (c + 2) & 4;  /* 1 if c = 2, 0 if c is in {-1,0,1} */
+    }
+    t |= r->coeffs[NTRU_N - 1]; /* Coefficient n-1 must be zero */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 static int owcpa_check_m(const poly *m) {
-    /* Check that m is in message space. */
-    /* Note: Assumes that m has coefficients in {0,1,2}. */
+    /* Check that m is in message space, i.e.                  */
+    /*  (1)  |{i : m[i] = 1}| = |{i : m[i] = 2}|, and          */
+    /*  (2)  |{i : m[i] != 0}| = NTRU_WEIGHT.                  */
+    /* Note: We may assume that m has coefficients in {0,1,2}. */
+
    int i;
-    uint64_t t = 0;
-    uint16_t p1 = 0;
-    uint16_t m1 = 0;
+    uint32_t t = 0;
+    uint16_t ps = 0;
+    uint16_t ms = 0;
    for (i = 0; i < NTRU_N; i++) {
-        p1 += m->coeffs[i] & 0x01;
-        m1 += (m->coeffs[i] & 0x02) >> 1;
+        ps += m->coeffs[i] & 1;
+        ms += m->coeffs[i] & 2;
    }
-    /* Need p1 = m1 and p1 + m1 = NTRU_WEIGHT */
-    t |= p1 ^ m1;
-    t |= (p1 + m1) ^ NTRU_WEIGHT;
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+    t |= ps ^ (ms >> 1);   /* 0 if (1) holds */
+    t |= ms ^ NTRU_WEIGHT; /* 0 if (1) and (2) hold */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 void PQCLEAN_NTRUHPS2048509_CLEAN_owcpa_keypair(unsigned char *pk,
@ -125,8 +144,8 @@ int PQCLEAN_NTRUHPS2048509_CLEAN_owcpa_dec(unsigned char *rm,

    fail = 0;

-    /* Check that unused bits of last byte of ciphertext are zero */
-    fail |= ciphertext[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG))));
+    /* Check that the unused bits of the last byte of the ciphertext are zero */
+    fail |= owcpa_check_ciphertext(ciphertext);

    /* For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)).             */
    /* We can avoid re-computing r*h + Lift(m) as long as we check that        */
--- a/crypto_kem/ntruhps2048509/clean/poly_mod.c
+++ b/crypto_kem/ntruhps2048509/clean/poly_mod.c
@ -30,14 +30,22 @@ void PQCLEAN_NTRUHPS2048509_CLEAN_poly_mod_q_Phi_n(poly *r) {
 }

 void PQCLEAN_NTRUHPS2048509_CLEAN_poly_Rq_to_S3(poly *r, const poly *a) {
-    /* NOTE: Assumes input is in [0,Q-1]^N */
-    /*       Produces output in {0,1,2}^N */
    int i;
+    uint16_t flag;

-    /* Center coeffs around 3Q: [0, Q-1] -> [3Q - Q/2, 3Q + Q/2) */
+    /* The coefficients of a are stored as non-negative integers. */
+    /* We must translate to representatives in [-q/2, q/2) before */
+    /* reduction mod 3.                                           */
    for (i = 0; i < NTRU_N; i++) {
-        r->coeffs[i] = ((MODQ(a->coeffs[i]) >> (NTRU_LOGQ - 1)) ^ 3) << NTRU_LOGQ;
-        r->coeffs[i] += MODQ(a->coeffs[i]);
+        /* Need an explicit reduction mod q here                    */
+        r->coeffs[i] = MODQ(a->coeffs[i]);
+
+        /* flag = 1 if r[i] >= q/2 else 0                            */
+        flag = r->coeffs[i] >> (NTRU_LOGQ - 1);
+
+        /* Now we will add (-q) mod 3 if r[i] >= q/2                 */
+        /* Note (-q) mod 3=(-2^k) mod 3=1<<(1-(k&1))                */
+        r->coeffs[i] += flag << (1 - (NTRU_LOGQ & 1));
    }

    PQCLEAN_NTRUHPS2048509_CLEAN_poly_mod_3_Phi_n(r);
--- a/crypto_kem/ntruhps2048677/META.yml
+++ b/crypto_kem/ntruhps2048677/META.yml
@ -23,9 +23,9 @@ auxiliary-submitters:
  - Zhenfei Zhang
 implementations:
    - name: clean
-      version: https://github.com/jschanck/ntru/tree/2d4df948 reference implementation
+      version: https://github.com/jschanck/ntru/tree/b38a346a reference implementation
    - name: avx2
-      version: https://github.com/jschanck/ntru/tree/2d4df948 avx2 implementation
+      version: https://github.com/jschanck/ntru/tree/b38a346a avx2 implementation
      supported_platforms:
          - architecture: x86_64
            operating_systems:
--- a/crypto_kem/ntruhps2048677/avx2/owcpa.c
+++ b/crypto_kem/ntruhps2048677/avx2/owcpa.c
@ -2,40 +2,59 @@
 #include "poly.h"
 #include "sample.h"

-static int owcpa_check_r(const poly *r) {
-    /* Check that r is in message space. */
-    /* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */
-    int i;
-    uint64_t t = 0;
-    uint16_t c;
-    for (i = 0; i < NTRU_N; i++) {
-        c = MODQ(r->coeffs[i] + 1);
-        t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */
-        t |= (c + 1) & 0x4;   /* 0 if c is in {0,1,2} */
+static int owcpa_check_ciphertext(const unsigned char *ciphertext) {
+    /* A ciphertext is log2(q)*(n-1) bits packed into bytes.  */
+    /* Check that any unused bits of the final byte are zero. */
+
+    uint16_t t = 0;
+
+    t = ciphertext[NTRU_CIPHERTEXTBYTES - 1];
+    t &= 0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG)));
+
+    /* We have 0 <= t < 256 */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 15));
 }
-    t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+
+static int owcpa_check_r(const poly *r) {
+    /* A valid r has coefficients in {0,1,q-1} and has r[N-1] = 0 */
+    /* Note: We may assume that 0 <= r[i] <= q-1 for all i        */
+
+    int i;
+    uint32_t t = 0;
+    uint16_t c;
+    for (i = 0; i < NTRU_N - 1; i++) {
+        c = r->coeffs[i];
+        t |= (c + 1) & (NTRU_Q - 4); /* 0 iff c is in {-1,0,1,2} */
+        t |= (c + 2) & 4;  /* 1 if c = 2, 0 if c is in {-1,0,1} */
+    }
+    t |= r->coeffs[NTRU_N - 1]; /* Coefficient n-1 must be zero */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 static int owcpa_check_m(const poly *m) {
-    /* Check that m is in message space. */
-    /* Note: Assumes that m has coefficients in {0,1,2}. */
+    /* Check that m is in message space, i.e.                  */
+    /*  (1)  |{i : m[i] = 1}| = |{i : m[i] = 2}|, and          */
+    /*  (2)  |{i : m[i] != 0}| = NTRU_WEIGHT.                  */
+    /* Note: We may assume that m has coefficients in {0,1,2}. */
+
    int i;
-    uint64_t t = 0;
-    uint16_t p1 = 0;
-    uint16_t m1 = 0;
+    uint32_t t = 0;
+    uint16_t ps = 0;
+    uint16_t ms = 0;
    for (i = 0; i < NTRU_N; i++) {
-        p1 += m->coeffs[i] & 0x01;
-        m1 += (m->coeffs[i] & 0x02) >> 1;
+        ps += m->coeffs[i] & 1;
+        ms += m->coeffs[i] & 2;
    }
-    /* Need p1 = m1 and p1 + m1 = NTRU_WEIGHT */
-    t |= p1 ^ m1;
-    t |= (p1 + m1) ^ NTRU_WEIGHT;
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+    t |= ps ^ (ms >> 1);   /* 0 if (1) holds */
+    t |= ms ^ NTRU_WEIGHT; /* 0 if (1) and (2) hold */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 void PQCLEAN_NTRUHPS2048677_AVX2_owcpa_keypair(unsigned char *pk,
@ -125,8 +144,8 @@ int PQCLEAN_NTRUHPS2048677_AVX2_owcpa_dec(unsigned char *rm,

    fail = 0;

-    /* Check that unused bits of last byte of ciphertext are zero */
-    fail |= ciphertext[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG))));
+    /* Check that the unused bits of the last byte of the ciphertext are zero */
+    fail |= owcpa_check_ciphertext(ciphertext);

    /* For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)).             */
    /* We can avoid re-computing r*h + Lift(m) as long as we check that        */
--- a/crypto_kem/ntruhps2048677/avx2/poly_rq_to_s3.s
+++ b/crypto_kem/ntruhps2048677/avx2/poly_rq_to_s3.s
--- a/crypto_kem/ntruhps2048677/clean/owcpa.c
+++ b/crypto_kem/ntruhps2048677/clean/owcpa.c
@ -2,40 +2,59 @@
 #include "poly.h"
 #include "sample.h"

-static int owcpa_check_r(const poly *r) {
-    /* Check that r is in message space. */
-    /* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */
-    int i;
-    uint64_t t = 0;
-    uint16_t c;
-    for (i = 0; i < NTRU_N; i++) {
-        c = MODQ(r->coeffs[i] + 1);
-        t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */
-        t |= (c + 1) & 0x4;   /* 0 if c is in {0,1,2} */
+static int owcpa_check_ciphertext(const unsigned char *ciphertext) {
+    /* A ciphertext is log2(q)*(n-1) bits packed into bytes.  */
+    /* Check that any unused bits of the final byte are zero. */
+
+    uint16_t t = 0;
+
+    t = ciphertext[NTRU_CIPHERTEXTBYTES - 1];
+    t &= 0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG)));
+
+    /* We have 0 <= t < 256 */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 15));
 }
-    t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+
+static int owcpa_check_r(const poly *r) {
+    /* A valid r has coefficients in {0,1,q-1} and has r[N-1] = 0 */
+    /* Note: We may assume that 0 <= r[i] <= q-1 for all i        */
+
+    int i;
+    uint32_t t = 0;
+    uint16_t c;
+    for (i = 0; i < NTRU_N - 1; i++) {
+        c = r->coeffs[i];
+        t |= (c + 1) & (NTRU_Q - 4); /* 0 iff c is in {-1,0,1,2} */
+        t |= (c + 2) & 4;  /* 1 if c = 2, 0 if c is in {-1,0,1} */
+    }
+    t |= r->coeffs[NTRU_N - 1]; /* Coefficient n-1 must be zero */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 static int owcpa_check_m(const poly *m) {
-    /* Check that m is in message space. */
-    /* Note: Assumes that m has coefficients in {0,1,2}. */
+    /* Check that m is in message space, i.e.                  */
+    /*  (1)  |{i : m[i] = 1}| = |{i : m[i] = 2}|, and          */
+    /*  (2)  |{i : m[i] != 0}| = NTRU_WEIGHT.                  */
+    /* Note: We may assume that m has coefficients in {0,1,2}. */
+
    int i;
-    uint64_t t = 0;
-    uint16_t p1 = 0;
-    uint16_t m1 = 0;
+    uint32_t t = 0;
+    uint16_t ps = 0;
+    uint16_t ms = 0;
    for (i = 0; i < NTRU_N; i++) {
-        p1 += m->coeffs[i] & 0x01;
-        m1 += (m->coeffs[i] & 0x02) >> 1;
+        ps += m->coeffs[i] & 1;
+        ms += m->coeffs[i] & 2;
    }
-    /* Need p1 = m1 and p1 + m1 = NTRU_WEIGHT */
-    t |= p1 ^ m1;
-    t |= (p1 + m1) ^ NTRU_WEIGHT;
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+    t |= ps ^ (ms >> 1);   /* 0 if (1) holds */
+    t |= ms ^ NTRU_WEIGHT; /* 0 if (1) and (2) hold */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 void PQCLEAN_NTRUHPS2048677_CLEAN_owcpa_keypair(unsigned char *pk,
@ -125,8 +144,8 @@ int PQCLEAN_NTRUHPS2048677_CLEAN_owcpa_dec(unsigned char *rm,

    fail = 0;

-    /* Check that unused bits of last byte of ciphertext are zero */
-    fail |= ciphertext[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG))));
+    /* Check that the unused bits of the last byte of the ciphertext are zero */
+    fail |= owcpa_check_ciphertext(ciphertext);

    /* For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)).             */
    /* We can avoid re-computing r*h + Lift(m) as long as we check that        */
--- a/crypto_kem/ntruhps2048677/clean/poly_mod.c
+++ b/crypto_kem/ntruhps2048677/clean/poly_mod.c
@ -30,14 +30,22 @@ void PQCLEAN_NTRUHPS2048677_CLEAN_poly_mod_q_Phi_n(poly *r) {
 }

 void PQCLEAN_NTRUHPS2048677_CLEAN_poly_Rq_to_S3(poly *r, const poly *a) {
-    /* NOTE: Assumes input is in [0,Q-1]^N */
-    /*       Produces output in {0,1,2}^N */
    int i;
+    uint16_t flag;

-    /* Center coeffs around 3Q: [0, Q-1] -> [3Q - Q/2, 3Q + Q/2) */
+    /* The coefficients of a are stored as non-negative integers. */
+    /* We must translate to representatives in [-q/2, q/2) before */
+    /* reduction mod 3.                                           */
    for (i = 0; i < NTRU_N; i++) {
-        r->coeffs[i] = ((MODQ(a->coeffs[i]) >> (NTRU_LOGQ - 1)) ^ 3) << NTRU_LOGQ;
-        r->coeffs[i] += MODQ(a->coeffs[i]);
+        /* Need an explicit reduction mod q here                    */
+        r->coeffs[i] = MODQ(a->coeffs[i]);
+
+        /* flag = 1 if r[i] >= q/2 else 0                            */
+        flag = r->coeffs[i] >> (NTRU_LOGQ - 1);
+
+        /* Now we will add (-q) mod 3 if r[i] >= q/2                 */
+        /* Note (-q) mod 3=(-2^k) mod 3=1<<(1-(k&1))                */
+        r->coeffs[i] += flag << (1 - (NTRU_LOGQ & 1));
    }

    PQCLEAN_NTRUHPS2048677_CLEAN_poly_mod_3_Phi_n(r);
--- a/crypto_kem/ntruhps4096821/META.yml
+++ b/crypto_kem/ntruhps4096821/META.yml
@ -23,9 +23,9 @@ auxiliary-submitters:
  - Zhenfei Zhang
 implementations:
    - name: clean
-      version: https://github.com/jschanck/ntru/tree/2d4df948 reference implementation
+      version: https://github.com/jschanck/ntru/tree/b38a346a reference implementation
    - name: avx2
-      version: https://github.com/jschanck/ntru/tree/2d4df948 avx2 implementation
+      version: https://github.com/jschanck/ntru/tree/b38a346a avx2 implementation
      supported_platforms:
          - architecture: x86_64
            operating_systems:
--- a/crypto_kem/ntruhps4096821/avx2/owcpa.c
+++ b/crypto_kem/ntruhps4096821/avx2/owcpa.c
@ -2,40 +2,59 @@
 #include "poly.h"
 #include "sample.h"

-static int owcpa_check_r(const poly *r) {
-    /* Check that r is in message space. */
-    /* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */
-    int i;
-    uint64_t t = 0;
-    uint16_t c;
-    for (i = 0; i < NTRU_N; i++) {
-        c = MODQ(r->coeffs[i] + 1);
-        t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */
-        t |= (c + 1) & 0x4;   /* 0 if c is in {0,1,2} */
+static int owcpa_check_ciphertext(const unsigned char *ciphertext) {
+    /* A ciphertext is log2(q)*(n-1) bits packed into bytes.  */
+    /* Check that any unused bits of the final byte are zero. */
+
+    uint16_t t = 0;
+
+    t = ciphertext[NTRU_CIPHERTEXTBYTES - 1];
+    t &= 0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG)));
+
+    /* We have 0 <= t < 256 */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 15));
 }
-    t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+
+static int owcpa_check_r(const poly *r) {
+    /* A valid r has coefficients in {0,1,q-1} and has r[N-1] = 0 */
+    /* Note: We may assume that 0 <= r[i] <= q-1 for all i        */
+
+    int i;
+    uint32_t t = 0;
+    uint16_t c;
+    for (i = 0; i < NTRU_N - 1; i++) {
+        c = r->coeffs[i];
+        t |= (c + 1) & (NTRU_Q - 4); /* 0 iff c is in {-1,0,1,2} */
+        t |= (c + 2) & 4;  /* 1 if c = 2, 0 if c is in {-1,0,1} */
+    }
+    t |= r->coeffs[NTRU_N - 1]; /* Coefficient n-1 must be zero */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 static int owcpa_check_m(const poly *m) {
-    /* Check that m is in message space. */
-    /* Note: Assumes that m has coefficients in {0,1,2}. */
+    /* Check that m is in message space, i.e.                  */
+    /*  (1)  |{i : m[i] = 1}| = |{i : m[i] = 2}|, and          */
+    /*  (2)  |{i : m[i] != 0}| = NTRU_WEIGHT.                  */
+    /* Note: We may assume that m has coefficients in {0,1,2}. */
+
    int i;
-    uint64_t t = 0;
-    uint16_t p1 = 0;
-    uint16_t m1 = 0;
+    uint32_t t = 0;
+    uint16_t ps = 0;
+    uint16_t ms = 0;
    for (i = 0; i < NTRU_N; i++) {
-        p1 += m->coeffs[i] & 0x01;
-        m1 += (m->coeffs[i] & 0x02) >> 1;
+        ps += m->coeffs[i] & 1;
+        ms += m->coeffs[i] & 2;
    }
-    /* Need p1 = m1 and p1 + m1 = NTRU_WEIGHT */
-    t |= p1 ^ m1;
-    t |= (p1 + m1) ^ NTRU_WEIGHT;
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+    t |= ps ^ (ms >> 1);   /* 0 if (1) holds */
+    t |= ms ^ NTRU_WEIGHT; /* 0 if (1) and (2) hold */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 void PQCLEAN_NTRUHPS4096821_AVX2_owcpa_keypair(unsigned char *pk,
@ -125,8 +144,8 @@ int PQCLEAN_NTRUHPS4096821_AVX2_owcpa_dec(unsigned char *rm,

    fail = 0;

-    /* Check that unused bits of last byte of ciphertext are zero */
-    fail |= ciphertext[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG))));
+    /* Check that the unused bits of the last byte of the ciphertext are zero */
+    fail |= owcpa_check_ciphertext(ciphertext);

    /* For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)).             */
    /* We can avoid re-computing r*h + Lift(m) as long as we check that        */
--- a/crypto_kem/ntruhps4096821/avx2/poly_rq_to_s3.s
+++ b/crypto_kem/ntruhps4096821/avx2/poly_rq_to_s3.s
--- a/crypto_kem/ntruhps4096821/clean/owcpa.c
+++ b/crypto_kem/ntruhps4096821/clean/owcpa.c
@ -2,40 +2,59 @@
 #include "poly.h"
 #include "sample.h"

-static int owcpa_check_r(const poly *r) {
-    /* Check that r is in message space. */
-    /* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */
-    int i;
-    uint64_t t = 0;
-    uint16_t c;
-    for (i = 0; i < NTRU_N; i++) {
-        c = MODQ(r->coeffs[i] + 1);
-        t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */
-        t |= (c + 1) & 0x4;   /* 0 if c is in {0,1,2} */
+static int owcpa_check_ciphertext(const unsigned char *ciphertext) {
+    /* A ciphertext is log2(q)*(n-1) bits packed into bytes.  */
+    /* Check that any unused bits of the final byte are zero. */
+
+    uint16_t t = 0;
+
+    t = ciphertext[NTRU_CIPHERTEXTBYTES - 1];
+    t &= 0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG)));
+
+    /* We have 0 <= t < 256 */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 15));
 }
-    t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+
+static int owcpa_check_r(const poly *r) {
+    /* A valid r has coefficients in {0,1,q-1} and has r[N-1] = 0 */
+    /* Note: We may assume that 0 <= r[i] <= q-1 for all i        */
+
+    int i;
+    uint32_t t = 0;
+    uint16_t c;
+    for (i = 0; i < NTRU_N - 1; i++) {
+        c = r->coeffs[i];
+        t |= (c + 1) & (NTRU_Q - 4); /* 0 iff c is in {-1,0,1,2} */
+        t |= (c + 2) & 4;  /* 1 if c = 2, 0 if c is in {-1,0,1} */
+    }
+    t |= r->coeffs[NTRU_N - 1]; /* Coefficient n-1 must be zero */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 static int owcpa_check_m(const poly *m) {
-    /* Check that m is in message space. */
-    /* Note: Assumes that m has coefficients in {0,1,2}. */
+    /* Check that m is in message space, i.e.                  */
+    /*  (1)  |{i : m[i] = 1}| = |{i : m[i] = 2}|, and          */
+    /*  (2)  |{i : m[i] != 0}| = NTRU_WEIGHT.                  */
+    /* Note: We may assume that m has coefficients in {0,1,2}. */
+
    int i;
-    uint64_t t = 0;
-    uint16_t p1 = 0;
-    uint16_t m1 = 0;
+    uint32_t t = 0;
+    uint16_t ps = 0;
+    uint16_t ms = 0;
    for (i = 0; i < NTRU_N; i++) {
-        p1 += m->coeffs[i] & 0x01;
-        m1 += (m->coeffs[i] & 0x02) >> 1;
+        ps += m->coeffs[i] & 1;
+        ms += m->coeffs[i] & 2;
    }
-    /* Need p1 = m1 and p1 + m1 = NTRU_WEIGHT */
-    t |= p1 ^ m1;
-    t |= (p1 + m1) ^ NTRU_WEIGHT;
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+    t |= ps ^ (ms >> 1);   /* 0 if (1) holds */
+    t |= ms ^ NTRU_WEIGHT; /* 0 if (1) and (2) hold */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }

 void PQCLEAN_NTRUHPS4096821_CLEAN_owcpa_keypair(unsigned char *pk,
@ -125,8 +144,8 @@ int PQCLEAN_NTRUHPS4096821_CLEAN_owcpa_dec(unsigned char *rm,

    fail = 0;

-    /* Check that unused bits of last byte of ciphertext are zero */
-    fail |= ciphertext[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG))));
+    /* Check that the unused bits of the last byte of the ciphertext are zero */
+    fail |= owcpa_check_ciphertext(ciphertext);

    /* For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)).             */
    /* We can avoid re-computing r*h + Lift(m) as long as we check that        */
--- a/crypto_kem/ntruhps4096821/clean/poly_mod.c
+++ b/crypto_kem/ntruhps4096821/clean/poly_mod.c
@ -30,14 +30,22 @@ void PQCLEAN_NTRUHPS4096821_CLEAN_poly_mod_q_Phi_n(poly *r) {
 }

 void PQCLEAN_NTRUHPS4096821_CLEAN_poly_Rq_to_S3(poly *r, const poly *a) {
-    /* NOTE: Assumes input is in [0,Q-1]^N */
-    /*       Produces output in {0,1,2}^N */
    int i;
+    uint16_t flag;

-    /* Center coeffs around 3Q: [0, Q-1] -> [3Q - Q/2, 3Q + Q/2) */
+    /* The coefficients of a are stored as non-negative integers. */
+    /* We must translate to representatives in [-q/2, q/2) before */
+    /* reduction mod 3.                                           */
    for (i = 0; i < NTRU_N; i++) {
-        r->coeffs[i] = ((MODQ(a->coeffs[i]) >> (NTRU_LOGQ - 1)) ^ 3) << NTRU_LOGQ;
-        r->coeffs[i] += MODQ(a->coeffs[i]);
+        /* Need an explicit reduction mod q here                    */
+        r->coeffs[i] = MODQ(a->coeffs[i]);
+
+        /* flag = 1 if r[i] >= q/2 else 0                            */
+        flag = r->coeffs[i] >> (NTRU_LOGQ - 1);
+
+        /* Now we will add (-q) mod 3 if r[i] >= q/2                 */
+        /* Note (-q) mod 3=(-2^k) mod 3=1<<(1-(k&1))                */
+        r->coeffs[i] += flag << (1 - (NTRU_LOGQ & 1));
    }

    PQCLEAN_NTRUHPS4096821_CLEAN_poly_mod_3_Phi_n(r);
--- a/crypto_kem/ntruhrss701/META.yml
+++ b/crypto_kem/ntruhrss701/META.yml
@ -23,9 +23,9 @@ auxiliary-submitters:
  - Zhenfei Zhang
 implementations:
    - name: clean
-      version: https://github.com/jschanck/ntru/tree/2d4df948 reference implementation
+      version: https://github.com/jschanck/ntru/tree/b38a346a reference implementation
    - name: avx2
-      version: https://github.com/jschanck/ntru/tree/2d4df948 avx2 implementation
+      version: https://github.com/jschanck/ntru/tree/b38a346a avx2 implementation
      supported_platforms:
          - architecture: x86_64
            operating_systems:
--- a/crypto_kem/ntruhrss701/avx2/Makefile
+++ b/crypto_kem/ntruhrss701/avx2/Makefile
@ -2,9 +2,9 @@

 LIB=libntruhrss701_avx2.a
 HEADERS=api.h cmov.h owcpa.h params.h poly.h poly_r2_inv.h sample.h 
-OBJECTS=cmov.o kem.o owcpa.o pack3.o packq.o poly.o poly_r2_inv.o sample.o sample_iid.o  \
+OBJECTS=cmov.o kem.o owcpa.o pack3.o packq.o poly.o poly_r2_inv.o poly_s3_inv.o sample.o sample_iid.o  \
        square_1_701_patience.o square_3_701_patience.o square_6_701_patience.o square_12_701_shufbytes.o square_15_701_shufbytes.o square_27_701_shufbytes.o square_42_701_shufbytes.o square_84_701_shufbytes.o square_168_701_shufbytes.o square_336_701_shufbytes.o  \
-        poly_lift.o poly_mod_3_Phi_n.o poly_mod_q_Phi_n.o poly_r2_mul.o poly_rq_mul.o poly_rq_to_s3.o poly_s3_inv.o  vec32_sample_iid.o
+        poly_lift.o poly_mod_3_Phi_n.o poly_mod_q_Phi_n.o poly_r2_mul.o poly_rq_mul.o poly_rq_to_s3.o  vec32_sample_iid.o

 CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)

--- a/crypto_kem/ntruhrss701/avx2/owcpa.c
+++ b/crypto_kem/ntruhrss701/avx2/owcpa.c
@ -2,21 +2,37 @@
 #include "poly.h"
 #include "sample.h"

-static int owcpa_check_r(const poly *r) {
-    /* Check that r is in message space. */
-    /* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */
-    int i;
-    uint64_t t = 0;
-    uint16_t c;
-    for (i = 0; i < NTRU_N; i++) {
-        c = MODQ(r->coeffs[i] + 1);
-        t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */
-        t |= (c + 1) & 0x4;   /* 0 if c is in {0,1,2} */
+static int owcpa_check_ciphertext(const unsigned char *ciphertext) {
+    /* A ciphertext is log2(q)*(n-1) bits packed into bytes.  */
+    /* Check that any unused bits of the final byte are zero. */
+
+    uint16_t t = 0;
+
+    t = ciphertext[NTRU_CIPHERTEXTBYTES - 1];
+    t &= 0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG)));
+
+    /* We have 0 <= t < 256 */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 15));
 }
-    t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+
+static int owcpa_check_r(const poly *r) {
+    /* A valid r has coefficients in {0,1,q-1} and has r[N-1] = 0 */
+    /* Note: We may assume that 0 <= r[i] <= q-1 for all i        */
+
+    int i;
+    uint32_t t = 0;
+    uint16_t c;
+    for (i = 0; i < NTRU_N - 1; i++) {
+        c = r->coeffs[i];
+        t |= (c + 1) & (NTRU_Q - 4); /* 0 iff c is in {-1,0,1,2} */
+        t |= (c + 2) & 4;  /* 1 if c = 2, 0 if c is in {-1,0,1} */
+    }
+    t |= r->coeffs[NTRU_N - 1]; /* Coefficient n-1 must be zero */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    return (int) (1 & ((~t + 1) >> 31));
 }


@ -108,8 +124,8 @@ int PQCLEAN_NTRUHRSS701_AVX2_owcpa_dec(unsigned char *rm,

    fail = 0;

-    /* Check that unused bits of last byte of ciphertext are zero */
-    fail |= ciphertext[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG))));
+    /* Check that the unused bits of the last byte of the ciphertext are zero */
+    fail |= owcpa_check_ciphertext(ciphertext);

    /* For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)).             */
    /* We can avoid re-computing r*h + Lift(m) as long as we check that        */
--- a/crypto_kem/ntruhrss701/avx2/poly_rq_to_s3.s
+++ b/crypto_kem/ntruhrss701/avx2/poly_rq_to_s3.s
--- a/crypto_kem/ntruhrss701/avx2/poly_s3_inv.c
+++ b/crypto_kem/ntruhrss701/avx2/poly_s3_inv.c
@ -0,0 +1,569 @@
+#include "poly.h"
+
+#include <immintrin.h>
+
+typedef signed char small; /* one polynomial coefficient per byte */
+
+#define p 700 /* number of coefficients read and written by the routines below */
+#define ppad 768 /* padded coefficient count: numvec * 256 */
+#define numvec 3 /* number of vec256 words holding one padded polynomial */
+
+typedef __m256i vec256; /* 256 one-bit coefficient slots */
+
+/*
+This code stores 768-coeff poly as vec256[3].
+Order of 256 coefficients in each vec256
+is optimized in light of costs of vector instructions:
+  0,4,...,252 in 64-bit word;
+  1,5,...,253 in 64-bit word;
+  2,6,...,254 in 64-bit word;
+  3,7,...,255 in 64-bit word.
+*/
+
+/* Packs numvec * 256 bytes from b into numvec vectors at v, one bit per   */
+/* input byte. Each iteration loads 8 x 32 bytes and ORs shifted copies    */
+/* together, so only inputs whose bytes are 0 or 1 pack cleanly (the       */
+/* caller in this file, vec256_init, builds its arrays that way). The      */
+/* closing shuffles reorder the packed bits; presumably this yields the    */
+/* lane layout described in the comment above -- confirm.                  */
+static inline void vec256_frombits(vec256 *v, const small *b) {
+    int i;
+
+    for (i = 0; i < numvec; ++i) {
+        /* Load 256 consecutive input bytes as eight 32-byte vectors. */
+        vec256 b0 = _mm256_loadu_si256((vec256 *) b);
+        b += 32; /* 0,1,...,31 */
+        vec256 b1 = _mm256_loadu_si256((vec256 *) b);
+        b += 32; /* 32,33,... */
+        vec256 b2 = _mm256_loadu_si256((vec256 *) b);
+        b += 32;
+        vec256 b3 = _mm256_loadu_si256((vec256 *) b);
+        b += 32;
+        vec256 b4 = _mm256_loadu_si256((vec256 *) b);
+        b += 32;
+        vec256 b5 = _mm256_loadu_si256((vec256 *) b);
+        b += 32;
+        vec256 b6 = _mm256_loadu_si256((vec256 *) b);
+        b += 32;
+        vec256 b7 = _mm256_loadu_si256((vec256 *) b);
+        b += 32;
+
+        vec256 c0 = _mm256_unpacklo_epi32(b0, b1); /* 0 1 2 3 32 33 34 35 4 5 6 7 36 37 38 39 ... 55 */
+        vec256 c1 = _mm256_unpackhi_epi32(b0, b1); /* 8 9 10 11 40 41 42 43 ... 63 */
+        vec256 c2 = _mm256_unpacklo_epi32(b2, b3);
+        vec256 c3 = _mm256_unpackhi_epi32(b2, b3);
+        vec256 c4 = _mm256_unpacklo_epi32(b4, b5);
+        vec256 c5 = _mm256_unpackhi_epi32(b4, b5);
+        vec256 c6 = _mm256_unpacklo_epi32(b6, b7);
+        vec256 c7 = _mm256_unpackhi_epi32(b6, b7);
+
+        /* Merge pairs: bytes now carry two input bits (bit 0 and bit 2). */
+        vec256 d0 = c0 | _mm256_slli_epi32(c1, 2); /* 0 8, 1 9, 2 10, 3 11, 32 40, 33 41, ..., 55 63 */
+        vec256 d2 = c2 | _mm256_slli_epi32(c3, 2);
+        vec256 d4 = c4 | _mm256_slli_epi32(c5, 2);
+        vec256 d6 = c6 | _mm256_slli_epi32(c7, 2);
+
+        vec256 e0 = _mm256_unpacklo_epi64(d0, d2);
+        vec256 e2 = _mm256_unpackhi_epi64(d0, d2);
+        vec256 e4 = _mm256_unpacklo_epi64(d4, d6);
+        vec256 e6 = _mm256_unpackhi_epi64(d4, d6);
+
+        /* Merge again: four input bits per byte (bits 0..3). */
+        vec256 f0 = e0 | _mm256_slli_epi32(e2, 1);
+        vec256 f4 = e4 | _mm256_slli_epi32(e6, 1);
+
+        vec256 g0 = _mm256_permute2x128_si256(f0, f4, 0x20);
+        vec256 g4 = _mm256_permute2x128_si256(f0, f4, 0x31);
+
+        /* Final merge: all eight bits of every byte are populated. */
+        vec256 h = g0 | _mm256_slli_epi32(g4, 4);
+
+/* Byte shuffle pattern; also used by vec256_tobits. */
+#define TRANSPOSE _mm256_set_epi8( 31,27,23,19, 30,26,22,18, 29,25,21,17, 28,24,20,16, 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0 )
+        h = _mm256_shuffle_epi8(h, TRANSPOSE);
+        h = _mm256_permute4x64_epi64(h, 0xd8);
+        h = _mm256_shuffle_epi32(h, 0xd8);
+
+        *v++ = h;
+    }
+}
+
+/* Expands numvec vectors at v into numvec * 256 bytes at b, each output   */
+/* byte being 0 or 1. The steps mirror vec256_frombits in reverse order:   */
+/* undo the shuffles, then repeatedly split the packed bits apart with     */
+/* shifts and masks.                                                       */
+static inline void vec256_tobits(const vec256 *v, small *b) {
+    int i;
+
+    for (i = 0; i < numvec; ++i) {
+        vec256 h = *v++;
+
+        /* Same three shuffles as in vec256_frombits, applied in reverse. */
+        h = _mm256_shuffle_epi32(h, 0xd8);
+        h = _mm256_permute4x64_epi64(h, 0xd8);
+        h = _mm256_shuffle_epi8(h, TRANSPOSE);
+
+        /* Split every byte into its low and high nibble. */
+        vec256 g0 = h & _mm256_set1_epi8(15);
+        vec256 g4 = _mm256_srli_epi32(h, 4) & _mm256_set1_epi8(15);
+
+        vec256 f0 = _mm256_permute2x128_si256(g0, g4, 0x20);
+        vec256 f4 = _mm256_permute2x128_si256(g0, g4, 0x31);
+
+        /* Split nibbles into bit pairs held at bit positions 0 and 2. */
+        vec256 e0 = f0 & _mm256_set1_epi8(5);
+        vec256 e2 = _mm256_srli_epi32(f0, 1) & _mm256_set1_epi8(5);
+        vec256 e4 = f4 & _mm256_set1_epi8(5);
+        vec256 e6 = _mm256_srli_epi32(f4, 1) & _mm256_set1_epi8(5);
+
+        vec256 d0 = _mm256_unpacklo_epi32(e0, e2);
+        vec256 d2 = _mm256_unpackhi_epi32(e0, e2);
+        vec256 d4 = _mm256_unpacklo_epi32(e4, e6);
+        vec256 d6 = _mm256_unpackhi_epi32(e4, e6);
+
+        /* Split the remaining pairs so every byte holds a single bit. */
+        vec256 c0 = d0 & _mm256_set1_epi8(1);
+        vec256 c1 = _mm256_srli_epi32(d0, 2) & _mm256_set1_epi8(1);
+        vec256 c2 = d2 & _mm256_set1_epi8(1);
+        vec256 c3 = _mm256_srli_epi32(d2, 2) & _mm256_set1_epi8(1);
+        vec256 c4 = d4 & _mm256_set1_epi8(1);
+        vec256 c5 = _mm256_srli_epi32(d4, 2) & _mm256_set1_epi8(1);
+        vec256 c6 = d6 & _mm256_set1_epi8(1);
+        vec256 c7 = _mm256_srli_epi32(d6, 2) & _mm256_set1_epi8(1);
+
+        vec256 b0 = _mm256_unpacklo_epi64(c0, c1);
+        vec256 b1 = _mm256_unpackhi_epi64(c0, c1);
+        vec256 b2 = _mm256_unpacklo_epi64(c2, c3);
+        vec256 b3 = _mm256_unpackhi_epi64(c2, c3);
+        vec256 b4 = _mm256_unpacklo_epi64(c4, c5);
+        vec256 b5 = _mm256_unpackhi_epi64(c4, c5);
+        vec256 b6 = _mm256_unpacklo_epi64(c6, c7);
+        vec256 b7 = _mm256_unpackhi_epi64(c6, c7);
+
+        /* Store 256 output bytes as eight 32-byte vectors. */
+        _mm256_storeu_si256((vec256 *) b, b0);
+        b += 32;
+        _mm256_storeu_si256((vec256 *) b, b1);
+        b += 32;
+        _mm256_storeu_si256((vec256 *) b, b2);
+        b += 32;
+        _mm256_storeu_si256((vec256 *) b, b3);
+        b += 32;
+        _mm256_storeu_si256((vec256 *) b, b4);
+        b += 32;
+        _mm256_storeu_si256((vec256 *) b, b5);
+        b += 32;
+        _mm256_storeu_si256((vec256 *) b, b6);
+        b += 32;
+        _mm256_storeu_si256((vec256 *) b, b7);
+        b += 32;
+    }
+}
+
+/* Builds the bit-sliced form (G0, G1) of the coefficient-reversed input.  */
+/* Reads s[0..p-1]; slot i of the result holds s[p-1-i] for i < p and 0    */
+/* for p <= i < ppad. For a coefficient c, the G0 bit is (c & 1) and the   */
+/* G1 bit is ((c >> 1) & G0 bit), so 0 -> (0,0), 1 -> (1,0), -1 -> (1,1).  */
+static void vec256_init(vec256 *G0, vec256 *G1, const small *s) {
+    int i;
+    small srev[ppad + (ppad - p)];
+    small si;
+    small g0[ppad];
+    small g1[ppad];
+
+    /* Reversed copy of s occupies srev[ppad-p .. ppad-1]. */
+    for (i = 0; i < p; ++i) {
+        srev[ppad - 1 - i] = s[i];
+    }
+    /* Zero the ppad-p bytes below the copy ... */
+    for (i = 0; i < ppad - p; ++i) {
+        srev[i] = 0;
+    }
+    /* ... and the ppad-p bytes above it. */
+    for (i = p; i < ppad; ++i) {
+        srev[i + ppad - p] = 0;
+    }
+
+    /* Read the window starting at offset ppad-p and split into two planes. */
+    for (i = 0; i < ppad; ++i) {
+        si = srev[i + ppad - p];
+        g0[i] = si & 1;
+        g1[i] = (si >> 1) & g0[i];
+    }
+
+    vec256_frombits(G0, g0);
+    vec256_frombits(G1, g1);
+}
+
+/* Converts the bit-sliced pair (V0, V1) back to signed bytes and writes   */
+/* p of them to out in reversed order: out[i] is the value of slot p-1-i.  */
+/* The plane pair (0,0) decodes to 0, (1,0) to 1 and (1,1) to -1.          */
+static void vec256_final(small *out, const vec256 *V0, const vec256 *V1) {
+    int i;
+    small v0[ppad];
+    small v1[ppad];
+    small v[ppad];
+    small vrev[ppad + (ppad - p)];
+
+    vec256_tobits(V0, v0);
+    vec256_tobits(V1, v1);
+
+    /* Recombine the two planes into one signed value per slot. */
+    for (i = 0; i < ppad; ++i) {
+        v[i] = v0[i] + 2 * v1[i] - 4 * (v0[i] & v1[i]);
+    }
+
+    /* Reverse all ppad slots, then zero-extend by ppad-p bytes. */
+    for (i = 0; i < ppad; ++i) {
+        vrev[i] = v[ppad - 1 - i];
+    }
+    for (i = ppad; i < ppad + (ppad - p); ++i) {
+        vrev[i] = 0;
+    }
+
+    /* Emit the p bytes starting at offset ppad-p of the reversed array. */
+    for (i = 0; i < p; ++i) {
+        out[i] = vrev[i + ppad - p];
+    }
+}
+
+/* Returns x >> 31, used as a branch-free sign mask.                       */
+/* NOTE(review): yields -1 for negative x and 0 otherwise only if int is   */
+/* 32 bits wide and >> on a negative int is an arithmetic shift, which C   */
+/* leaves implementation-defined -- confirm for the target compilers.      */
+static inline int negative_mask(int x) {
+    return x >> 31;
+}
+
+/* Branch-free conditional swap of the first len vectors of f and g:       */
+/* bits selected by mask are exchanged, all other bits are left in place.  */
+/* The callers in this file pass an all-zero or all-one mask.              */
+static inline void vec256_swap(vec256 *f, vec256 *g, int len, vec256 mask) {
+    vec256 flip;
+    int i;
+
+    for (i = 0; i < len; ++i) {
+        flip = mask & (f[i] ^ g[i]); /* differing bits, where selected */
+        f[i] ^= flip;
+        g[i] ^= flip;
+    }
+}
+
+/* Updates all numvec vectors of the pair (f0, f1) in place using the      */
+/* broadcast masks (c0, c1): f0 is ANDed with c0, and f1 becomes           */
+/* (f1 ^ c1) & f0. With the (nonzero, negative) plane encoding built by    */
+/* vec256_init this multiplies every coefficient by the constant encoded   */
+/* by (c0, c1).                                                            */
+static inline void vec256_scale(vec256 *f0, vec256 *f1, const vec256 c0, const vec256 c1) {
+    int i;
+
+    for (i = 0; i < numvec; ++i) {
+        vec256 f0i = f0[i];
+        vec256 f1i = f1[i];
+
+        f0i &= c0; /* nonzero only where both factors are nonzero */
+        f1i ^= c1; /* signs combine by XOR ... */
+        f1i &= f0i; /* ... and are cleared wherever the product is zero */
+
+        f0[i] = f0i;
+        f1[i] = f1i;
+    }
+}
+
+/* For the first len vectors, replaces g by g - c*f (coefficient-wise,     */
+/* mod 3) where c is the constant encoded by the masks (c0, c1) and all    */
+/* values use the (nonzero, negative) plane encoding. f0 and f1 are only   */
+/* read; the scaled copy of f lives in locals.                             */
+static inline void vec256_eliminate(vec256 *f0, vec256 *f1, vec256 *g0, vec256 *g1, int len, const vec256 c0, const vec256 c1) {
+    int i;
+
+    for (i = 0; i < len; ++i) {
+        vec256 f0i = f0[i];
+        vec256 f1i = f1[i];
+        vec256 g0i = g0[i];
+        vec256 g1i = g1[i];
+        vec256 t;
+
+        /* Scale f by c, same formula as vec256_scale. */
+        f0i &= c0;
+        f1i ^= c1;
+        f1i &= f0i;
+
+        /* Bitwise ternary subtraction g - (c*f) on the two planes. */
+        t = g0i ^ f0i;
+        g0[i] = t | (g1i ^ f1i);
+        g1[i] = (g1i ^ f0i) & (f1i ^ t);
+    }
+}
+
+/* Returns 0 if the lowest bit of f[0] is clear and -1 (all bits set) if   */
+/* it is set, i.e. a mask derived from the first coefficient slot.         */
+static inline int vec256_bit0mask(vec256 *f) {
+    return -(_mm_cvtsi128_si32(_mm256_castsi256_si128(f[0])) & 1);
+}
+
+/* Shifts the 256 coefficient slots of f[0] down by one position (division */
+/* by x that discards slot 0) and clears the top slot. With the layout     */
+/* described above, this is a 1-bit right shift of the lowest 64-bit word  */
+/* followed by a rotation of the four words.                               */
+static inline void vec256_divx_1(vec256 *f) {
+    vec256 f0 = f[0];
+
+    unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
+
+    low0 = low0 >> 1;
+
+    /* Blend mask 0x3 replaces only the lowest 64 bits of f0 with low0. */
+    f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
+
+    /* 0x39 moves word 1 to word 0, 2 to 1, 3 to 2 and 0 to 3. */
+    f[0] = _mm256_permute4x64_epi64(f0, 0x39);
+}
+
+/* Two-vector variant of vec256_divx_1: shifts the 512 slots held in f[0]  */
+/* and f[1] down by one position. The lowest slot of f[1] is carried into  */
+/* the top slot of f[0]; the top slot of f[1] becomes 0.                   */
+static inline void vec256_divx_2(vec256 *f) {
+    vec256 f0 = f[0];
+    vec256 f1 = f[1];
+
+    unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
+    unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1));
+
+    low0 = (low0 >> 1) | (low1 << 63); /* carry in from the next vector */
+    low1 = low1 >> 1;
+
+    /* Blend mask 0x3 replaces only the lowest 64 bits of each vector. */
+    f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
+    f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3);
+
+    /* 0x39 moves word 1 to word 0, 2 to 1, 3 to 2 and 0 to 3. */
+    f[0] = _mm256_permute4x64_epi64(f0, 0x39);
+    f[1] = _mm256_permute4x64_epi64(f1, 0x39);
+}
+
+/* Three-vector variant of vec256_divx_1: shifts the 768 slots held in     */
+/* f[0..2] down by one position, carrying the lowest slot of each vector   */
+/* into the top slot of the previous one; the top slot of f[2] becomes 0.  */
+static inline void vec256_divx_3(vec256 *f) {
+    vec256 f0 = f[0];
+    vec256 f1 = f[1];
+    vec256 f2 = f[2];
+
+    unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
+    unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1));
+    unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2));
+
+    low0 = (low0 >> 1) | (low1 << 63); /* carry in from f[1] */
+    low1 = (low1 >> 1) | (low2 << 63); /* carry in from f[2] */
+    low2 = low2 >> 1;
+
+    /* Blend mask 0x3 replaces only the lowest 64 bits of each vector. */
+    f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
+    f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3);
+    f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3);
+
+    /* 0x39 moves word 1 to word 0, 2 to 1, 3 to 2 and 0 to 3. */
+    f[0] = _mm256_permute4x64_epi64(f0, 0x39);
+    f[1] = _mm256_permute4x64_epi64(f1, 0x39);
+    f[2] = _mm256_permute4x64_epi64(f2, 0x39);
+}
+
+/* Shifts the 256 coefficient slots of f[0] up by one position             */
+/* (multiplication by x): slot 0 becomes 0 and the former top slot is      */
+/* dropped. Implemented as a rotation of the four 64-bit words followed    */
+/* by a 1-bit left shift of the lowest word.                               */
+static inline void vec256_timesx_1(vec256 *f) {
+    /* 0x93 moves word 3 to word 0, 0 to 1, 1 to 2 and 2 to 3. */
+    vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93);
+
+    unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
+
+    low0 = low0 << 1;
+
+    /* Blend mask 0x3 replaces only the lowest 64 bits of f0 with low0. */
+    f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
+
+    f[0] = f0;
+}
+
+/* Two-vector variant of vec256_timesx_1: shifts the 512 slots held in     */
+/* f[0] and f[1] up by one position. The top slot of f[0] is carried into  */
+/* the lowest slot of f[1]; the former top slot of f[1] is dropped.        */
+static inline void vec256_timesx_2(vec256 *f) {
+    /* 0x93 moves word 3 to word 0, 0 to 1, 1 to 2 and 2 to 3. */
+    vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93);
+    vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93);
+
+    unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0));
+    unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1));
+
+    low1 = (low1 << 1) | (low0 >> 63); /* carry in from f[0] */
+    low0 = low0 << 1;
+
+    /* Blend mask 0x3 replaces only the lowest 64 bits of each vector. */
+    f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3);
+    f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3);
+
+    f[0] = f0;
+    f[1] = f1;
+}
+
+/* Three-vector variant of vec256_timesx_1: shifts the 768 slots held in   */
+/* f[0..2] up by one position, carrying the top slot of each vector into   */
+/* the lowest slot of the next; the former top slot of f[2] is dropped.    */
+/* NOTE(review): unlike vec256_timesx_1/2, the low words of f0 and f1 are  */
+/* read and written through an unsigned long long pointer cast instead of  */
+/* the extract/blend intrinsics; consider unifying the two styles and      */
+/* confirm the cast is acceptable under the compilers' aliasing rules.     */
+static inline void vec256_timesx_3(vec256 *f) {
+    /* 0x93 moves word 3 to word 0, 0 to 1, 1 to 2 and 2 to 3. */
+    vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93);
+    vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93);
+    vec256 f2 = _mm256_permute4x64_epi64(f[2], 0x93);
+
+    unsigned long long low0 = *(unsigned long long *) &f0;
+    unsigned long long low1 = *(unsigned long long *) &f1;
+    unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2));
+
+    low2 = (low2 << 1) | (low1 >> 63); /* carry in from f[1] */
+    low1 = (low1 << 1) | (low0 >> 63); /* carry in from f[0] */
+    low0 = low0 << 1;
+
+    *(unsigned long long *) &f0 = low0;
+    *(unsigned long long *) &f1 = low1;
+    f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3);
+
+    f[0] = f0;
+    f[1] = f1;
+    f[2] = f2;
+}
+
+
+/* Core routine: runs 256 + 256 + 375 + 256 + 256 = 1399 (= 2p - 1)        */
+/* identical update steps on the bit-sliced polynomials F, G, V and R.     */
+/* The five loops differ only in how many vectors of each polynomial they  */
+/* touch: the V/R window grows (1, 2, 3 vectors) while the F/G window      */
+/* shrinks (3, 2, 1 vectors).                                              */
+/*                                                                         */
+/* inbytes is read as p signed bytes. outbytes receives p result bytes     */
+/* followed by one status byte at index p: 0 if minusdelta ends            */
+/* non-negative, -1 otherwise. The function itself always returns 0.       */
+/* NOTE(review): presumably a zero status byte means the input was         */
+/* invertible, as in the Bernstein--Yang code this is based on -- confirm. */
+/* NOTE(review): identifiers beginning with two underscores are reserved   */
+/* for the implementation in C; a rename would have to cover the caller.   */
+static int __poly_S3_inv(unsigned char *outbytes, const unsigned char *inbytes) {
+    small *out = (void *) outbytes;
+    small *in = (void *) inbytes;
+    vec256 F0[numvec];
+    vec256 F1[numvec];
+    vec256 G0[numvec];
+    vec256 G1[numvec];
+    vec256 V0[numvec];
+    vec256 V1[numvec];
+    vec256 R0[numvec];
+    vec256 R1[numvec];
+    vec256 c0vec, c1vec;
+    int loop;
+    int c0, c1;
+    int minusdelta = -1;
+    int swapmask;
+    vec256 swapvec;
+
+    /* G = coefficient-reversed input. */
+    vec256_init(G0, G1, in);
+    /* F = polynomial with coefficients 0..p all equal to +1: F0 has       */
+    /* 512 + 48 + 3 * 47 = 701 bits set and the sign plane F1 is zero.      */
+    F0[0] = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
+    F0[1] = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
+    F0[2] = _mm256_set_epi32(32767, -1, 32767, -1, 32767, -1, 65535, -1);
+    F1[0] = _mm256_set1_epi32(0);
+    F1[1] = _mm256_set1_epi32(0);
+    F1[2] = _mm256_set1_epi32(0);
+
+    /* V = 0. */
+    V0[0] = _mm256_set1_epi32(0);
+    V1[0] = _mm256_set1_epi32(0);
+    V0[1] = _mm256_set1_epi32(0);
+    V1[1] = _mm256_set1_epi32(0);
+    V0[2] = _mm256_set1_epi32(0);
+    V1[2] = _mm256_set1_epi32(0);
+
+    /* R = 1 (only coefficient slot 0 set, positive sign). */
+    R0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1);
+    R1[0] = _mm256_set1_epi32(0);
+    R0[1] = _mm256_set1_epi32(0);
+    R1[1] = _mm256_set1_epi32(0);
+    R0[2] = _mm256_set1_epi32(0);
+    R1[2] = _mm256_set1_epi32(0);
+
+    /* Phase 1: 256 steps; F/G use 3 vectors, V/R use 1 vector. */
+    for (loop = 256; loop > 0; --loop) {
+        vec256_timesx_1(V0);
+        vec256_timesx_1(V1);
+        /* Swap only if minusdelta is negative and G has a nonzero constant term. */
+        swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);
+
+        /* (c0, c1) encodes the product of the constant terms of F and G. */
+        c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
+        c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
+        c1 &= c0;
+
+        /* Negate minusdelta when swapping, then decrement it. */
+        minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
+        minusdelta -= 1;
+
+        swapvec = _mm256_set1_epi32(swapmask);
+        vec256_swap(F0, G0, 3, swapvec);
+        vec256_swap(F1, G1, 3, swapvec);
+
+        c0vec = _mm256_set1_epi32(c0);
+        c1vec = _mm256_set1_epi32(c1);
+
+        /* G <- (G - c*F) / x */
+        vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec);
+        vec256_divx_3(G0);
+        vec256_divx_3(G1);
+
+        /* Apply the same swap and elimination to (V, R). */
+        vec256_swap(V0, R0, 1, swapvec);
+        vec256_swap(V1, R1, 1, swapvec);
+        vec256_eliminate(V0, V1, R0, R1, 1, c0vec, c1vec);
+    }
+
+    /* Phase 2: 256 steps; F/G use 3 vectors, V/R use 2 vectors. */
+    for (loop = 256; loop > 0; --loop) {
+        vec256_timesx_2(V0);
+        vec256_timesx_2(V1);
+        swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);
+
+        c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
+        c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
+        c1 &= c0;
+
+        minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
+        minusdelta -= 1;
+
+        swapvec = _mm256_set1_epi32(swapmask);
+        vec256_swap(F0, G0, 3, swapvec);
+        vec256_swap(F1, G1, 3, swapvec);
+
+        c0vec = _mm256_set1_epi32(c0);
+        c1vec = _mm256_set1_epi32(c1);
+
+        vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec);
+        vec256_divx_3(G0);
+        vec256_divx_3(G1);
+
+        vec256_swap(V0, R0, 2, swapvec);
+        vec256_swap(V1, R1, 2, swapvec);
+        vec256_eliminate(V0, V1, R0, R1, 2, c0vec, c1vec);
+    }
+
+    /* Phase 3: 375 steps; F/G and V/R all use 3 vectors. */
+    for (loop = 375; loop > 0; --loop) {
+        vec256_timesx_3(V0);
+        vec256_timesx_3(V1);
+        swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);
+
+        c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
+        c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
+        c1 &= c0;
+
+        minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
+        minusdelta -= 1;
+
+        swapvec = _mm256_set1_epi32(swapmask);
+        vec256_swap(F0, G0, 3, swapvec);
+        vec256_swap(F1, G1, 3, swapvec);
+
+        c0vec = _mm256_set1_epi32(c0);
+        c1vec = _mm256_set1_epi32(c1);
+
+        vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec);
+        vec256_divx_3(G0);
+        vec256_divx_3(G1);
+
+        vec256_swap(V0, R0, 3, swapvec);
+        vec256_swap(V1, R1, 3, swapvec);
+        vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec);
+    }
+
+    /* Phase 4: 256 steps; F/G use 2 vectors, V/R use 3 vectors. */
+    for (loop = 256; loop > 0; --loop) {
+        vec256_timesx_3(V0);
+        vec256_timesx_3(V1);
+        swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);
+
+        c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
+        c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
+        c1 &= c0;
+
+        minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
+        minusdelta -= 1;
+
+        swapvec = _mm256_set1_epi32(swapmask);
+        vec256_swap(F0, G0, 2, swapvec);
+        vec256_swap(F1, G1, 2, swapvec);
+
+        c0vec = _mm256_set1_epi32(c0);
+        c1vec = _mm256_set1_epi32(c1);
+
+        vec256_eliminate(F0, F1, G0, G1, 2, c0vec, c1vec);
+        vec256_divx_2(G0);
+        vec256_divx_2(G1);
+
+        vec256_swap(V0, R0, 3, swapvec);
+        vec256_swap(V1, R1, 3, swapvec);
+        vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec);
+    }
+
+    /* Phase 5: 256 steps; F/G use 1 vector, V/R use 3 vectors. */
+    for (loop = 256; loop > 0; --loop) {
+        vec256_timesx_3(V0);
+        vec256_timesx_3(V1);
+        swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0);
+
+        c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0);
+        c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1);
+        c1 &= c0;
+
+        minusdelta ^= swapmask & (minusdelta ^ -minusdelta);
+        minusdelta -= 1;
+
+        swapvec = _mm256_set1_epi32(swapmask);
+        vec256_swap(F0, G0, 1, swapvec);
+        vec256_swap(F1, G1, 1, swapvec);
+
+        c0vec = _mm256_set1_epi32(c0);
+        c1vec = _mm256_set1_epi32(c1);
+
+        vec256_eliminate(F0, F1, G0, G1, 1, c0vec, c1vec);
+        vec256_divx_1(G0);
+        vec256_divx_1(G1);
+
+        vec256_swap(V0, R0, 3, swapvec);
+        vec256_swap(V1, R1, 3, swapvec);
+        vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec);
+    }
+
+    /* Multiply V by the constant term of F before converting it back. */
+    c0vec = _mm256_set1_epi32(vec256_bit0mask(F0));
+    c1vec = _mm256_set1_epi32(vec256_bit0mask(F1));
+    vec256_scale(V0, V1, c0vec, c1vec);
+
+    vec256_final(out, V0, V1);
+    out[p] = negative_mask(minusdelta); /* status byte, see header comment */
+    return 0;
+}
+
+// This code is based on crypto_core/invhrss701/faster from SUPERCOP. The code was written as a case study
+// for the paper "Fast constant-time gcd computation and modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
+//
+// Wrapper around __poly_S3_inv: converts the first p coefficients of a to signed bytes in {-1,0,1},
+// runs the inversion, and writes the first p coefficients of r_out as values in {0,1,2} (2 stands for -1).
+// NOTE(review): both polys are accessed as raw bytes with a stride of 2, which presumably assumes 16-bit
+// little-endian coefficients -- confirm against the definition of poly.
+// NOTE(review): coefficients of r_out at index >= p are not written here, and the status byte that
+// __poly_S3_inv stores at output[p] is not inspected.
+void PQCLEAN_NTRUHRSS701_AVX2_poly_S3_inv(poly *r_out, const poly *a) {
+    const unsigned char *in = (void *) a;
+    unsigned char *out = (void *) r_out;
+
+    small input[ppad];
+    small output[ppad];
+    int i;
+
+    /* XXX: obviously input/output format should be packed into bytes */
+
+    /* Map the low two bits of each coefficient: 0 -> 0, 1 -> 1, 2 -> -1, 3 -> -1. */
+    for (i = 0; i < p; ++i) {
+        small x = in[2 * i] & 3; /* 0 1 2 3 */
+        x += 1; /* 0 1 2 3 4 5 6, offset by 1 */
+        x &= (x - 3) >> 5; /* 0 1 2, offset by 1 */
+        input[i] = x - 1;
+    }
+    /* XXX: merge with vec256_init */
+
+    __poly_S3_inv((unsigned char *)output, (unsigned char *)input);
+
+    /* Map -1 -> 2, 0 -> 0, 1 -> 1 and clear the high byte of each coefficient. */
+    for (i = 0; i < p; ++i) {
+        out[2 * i] = (3 & output[i]) ^ ((3 & output[i]) >> 1);
+        out[2 * i + 1] = 0;
+    }
+}
--- a/crypto_kem/ntruhrss701/avx2/poly_s3_inv.s
+++ b/crypto_kem/ntruhrss701/avx2/poly_s3_inv.s
--- a/crypto_kem/ntruhrss701/clean/owcpa.c
+++ b/crypto_kem/ntruhrss701/clean/owcpa.c
@ -2,21 +2,37 @@
 #include "poly.h"
 #include "sample.h"

-static int owcpa_check_r(const poly *r) {
-    /* Check that r is in message space. */
-    /* Note: Assumes that r has coefficients in {0, 1, ..., q-1} */
-    int i;
-    uint64_t t = 0;
-    uint16_t c;
-    for (i = 0; i < NTRU_N; i++) {
-        c = MODQ(r->coeffs[i] + 1);
-        t |= c & (NTRU_Q - 4); /* 0 if c is in {0,1,2,3} */
-        t |= (c + 1) & 0x4;   /* 0 if c is in {0,1,2} */
+/* Checks the padding bits in the last byte of a packed ciphertext.       */
+/* Returns 0 when the masked bits are all zero and 1 otherwise; the       */
+/* result is computed without branching on the ciphertext contents.       */
+static int owcpa_check_ciphertext(const unsigned char *ciphertext) {
+    /* A ciphertext is log2(q)*(n-1) bits packed into bytes.  */
+    /* Check that any unused bits of the final byte are zero. */
+
+    uint16_t t = 0;
+
+    t = ciphertext[NTRU_CIPHERTEXTBYTES - 1];
+    /* With k = (NTRU_LOGQ * NTRU_PACK_DEG) mod 8, the low byte of this    */
+    /* mask has its top k bits set; for k = 0 the mask is 0xff00 and the   */
+    /* whole byte is accepted.                                            */
+    /* NOTE(review): assuming the k used bits sit at the low end of the   */
+    /* byte, the top k bits equal the unused bits only when k is 0 or 4;  */
+    /* confirm that this holds for every supported parameter set.         */
+    t &= 0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG)));
+
+    /* We have 0 <= t < 256 */
+    /* Return 0 on success (t=0), 1 on failure */
+    /* t is promoted before negation, so (~t + 1) is -t; for any nonzero  */
+    /* t < 256 that value has bit 15 set, which the shift moves to bit 0. */
+    return (int) (1 & ((~t + 1) >> 15));
 }
-    t |= MODQ(r->coeffs[NTRU_N - 1]); /* Coefficient n-1 must be zero */
-    t = (~t + 1); // two's complement
-    t >>= 63;
-    return (int) t;
+
+/* Checks that r lies in the message space without branching on its      */
+/* coefficients. Returns 0 if r is valid and 1 otherwise.                */
+static int owcpa_check_r(const poly *r) {
+    /* A valid r has coefficients in {0,1,q-1} and has r[N-1] = 0 */
+    /* Note: We may assume that 0 <= r[i] <= q-1 for all i        */
+
+    int i;
+    uint32_t t = 0; /* accumulates a nonzero bit for every violation */
+    uint16_t c;
+    for (i = 0; i < NTRU_N - 1; i++) {
+        c = r->coeffs[i];
+        t |= (c + 1) & (NTRU_Q - 4); /* 0 iff c is in {-1,0,1,2} */
+        t |= (c + 2) & 4;  /* nonzero (4) if c = 2, 0 if c is in {-1,0,1} */
+    }
+    t |= r->coeffs[NTRU_N - 1]; /* Coefficient n-1 must be zero */
+
+    /* We have 0 <= t < 2^16. */
+    /* Return 0 on success (t=0), 1 on failure */
+    /* t is unsigned 32-bit, so (~t + 1) is its two's-complement negation; */
+    /* for 0 < t < 2^16 the top bit is set and the shift extracts it.      */
+    return (int) (1 & ((~t + 1) >> 31));
 }


@ -108,8 +124,8 @@ int PQCLEAN_NTRUHRSS701_CLEAN_owcpa_dec(unsigned char *rm,

    fail = 0;

-    /* Check that unused bits of last byte of ciphertext are zero */
-    fail |= ciphertext[NTRU_CIPHERTEXTBYTES - 1] & (0xff << (8 - (7 & (NTRU_LOGQ * NTRU_PACK_DEG))));
+    /* Check that the unused bits of the last byte of the ciphertext are zero */
+    fail |= owcpa_check_ciphertext(ciphertext);

    /* For the IND-CCA2 KEM we must ensure that c = Enc(h, (r,m)).             */
    /* We can avoid re-computing r*h + Lift(m) as long as we check that        */
--- a/crypto_kem/ntruhrss701/clean/poly_mod.c
+++ b/crypto_kem/ntruhrss701/clean/poly_mod.c
@ -30,14 +30,22 @@ void PQCLEAN_NTRUHRSS701_CLEAN_poly_mod_q_Phi_n(poly *r) {
 }

 void PQCLEAN_NTRUHRSS701_CLEAN_poly_Rq_to_S3(poly *r, const poly *a) {
-    /* NOTE: Assumes input is in [0,Q-1]^N */
-    /*       Produces output in {0,1,2}^N */
    int i;
+    uint16_t flag;

-    /* Center coeffs around 3Q: [0, Q-1] -> [3Q - Q/2, 3Q + Q/2) */
+    /* The coefficients of a are stored as non-negative integers. */
+    /* We must translate to representatives in [-q/2, q/2) before */
+    /* reduction mod 3.                                           */
    for (i = 0; i < NTRU_N; i++) {
-        r->coeffs[i] = ((MODQ(a->coeffs[i]) >> (NTRU_LOGQ - 1)) ^ 3) << NTRU_LOGQ;
-        r->coeffs[i] += MODQ(a->coeffs[i]);
+        /* Need an explicit reduction mod q here                    */
+        r->coeffs[i] = MODQ(a->coeffs[i]);
+
+        /* flag = 1 if r[i] >= q/2 else 0                            */
+        flag = r->coeffs[i] >> (NTRU_LOGQ - 1);
+
+        /* Now we will add (-q) mod 3 if r[i] >= q/2                 */
+        /* Note (-q) mod 3=(-2^k) mod 3=1<<(1-(k&1))                */
+        r->coeffs[i] += flag << (1 - (NTRU_LOGQ & 1));
    }

    PQCLEAN_NTRUHRSS701_CLEAN_poly_mod_3_Phi_n(r);