From 431dbada4533cf867f74b80ed09db7399dd0cdb9 Mon Sep 17 00:00:00 2001
From: "John M. Schanck"
Date: Fri, 28 Aug 2020 20:18:54 -0400
Subject: [PATCH] Add sntrup{653,761,857} and ntrulpr{653,761,857}

Exported from SUPERCOP-20200826 using the scripts at:
https://github.com/jschanck/pqclean-package-ntruprime
---
 crypto_kem/ntrulpr653/META.yml | 26 + crypto_kem/ntrulpr653/avx2/LICENSE | 1 + crypto_kem/ntrulpr653/avx2/Makefile | 22 + crypto_kem/ntrulpr653/avx2/api.h | 16 + .../avx2/crypto_core_multsntrup653.c | 314 +++++ .../avx2/crypto_core_multsntrup653.h | 11 + .../avx2/crypto_core_multsntrup653_ntt.c | 927 +++++++++++++ .../avx2/crypto_core_multsntrup653_ntt.h | 13 + .../ntrulpr653/avx2/crypto_decode_256x16.c | 11 + .../ntrulpr653/avx2/crypto_decode_256x16.h | 10 + .../ntrulpr653/avx2/crypto_decode_256x2.c | 27 + .../ntrulpr653/avx2/crypto_decode_256x2.h | 10 + .../ntrulpr653/avx2/crypto_decode_653x1541.c | 408 ++++++ .../ntrulpr653/avx2/crypto_decode_653x1541.h | 10 + .../ntrulpr653/avx2/crypto_decode_653x3.c | 65 + .../ntrulpr653/avx2/crypto_decode_653x3.h | 10 + .../ntrulpr653/avx2/crypto_decode_653xint16.c | 16 + .../ntrulpr653/avx2/crypto_decode_653xint16.h | 10 + .../ntrulpr653/avx2/crypto_decode_653xint32.c | 20 + .../ntrulpr653/avx2/crypto_decode_653xint32.h | 10 + .../ntrulpr653/avx2/crypto_encode_256x16.c | 10 + .../ntrulpr653/avx2/crypto_encode_256x16.h | 10 + .../ntrulpr653/avx2/crypto_encode_256x2.c | 88 ++ .../ntrulpr653/avx2/crypto_encode_256x2.h | 10 + .../ntrulpr653/avx2/crypto_encode_653x1541.c | 286 ++++ .../ntrulpr653/avx2/crypto_encode_653x1541.h | 10 + .../avx2/crypto_encode_653x1541round.c | 288 ++++ .../avx2/crypto_encode_653x1541round.h | 10 + .../ntrulpr653/avx2/crypto_encode_653x3.c | 64 + .../ntrulpr653/avx2/crypto_encode_653x3.h | 10 + .../ntrulpr653/avx2/crypto_encode_653xint16.c | 13 + .../ntrulpr653/avx2/crypto_encode_653xint16.h | 10 + .../ntrulpr653/avx2/crypto_sort_int32.c | 1210 +++++++++++++++++ .../ntrulpr653/avx2/crypto_sort_int32.h | 10 + .../ntrulpr653/avx2/crypto_sort_uint32.c | 20 + .../ntrulpr653/avx2/crypto_sort_uint32.h | 10 + .../ntrulpr653/avx2/crypto_stream_aes256ctr.c | 15 + .../ntrulpr653/avx2/crypto_stream_aes256ctr.h | 15 + .../ntrulpr653/avx2/crypto_verify_1025.c | 36 + .../ntrulpr653/avx2/crypto_verify_1025.h | 8 + crypto_kem/ntrulpr653/avx2/kem.c | 287 ++++ crypto_kem/ntrulpr653/avx2/params.h | 61 + crypto_kem/ntrulpr653/clean/LICENSE | 1 + crypto_kem/ntrulpr653/clean/Makefile | 19 + .../ntrulpr653/clean/Makefile.Microsoft_nmake | 19 + crypto_kem/ntrulpr653/clean/api.h | 16 + .../clean/crypto_core_multsntrup653.c | 60 + .../clean/crypto_core_multsntrup653.h | 11 + .../ntrulpr653/clean/crypto_decode_256x16.c | 11 + .../ntrulpr653/clean/crypto_decode_256x16.h | 10 + .../ntrulpr653/clean/crypto_decode_256x2.c | 10 + .../ntrulpr653/clean/crypto_decode_256x2.h | 10 + .../ntrulpr653/clean/crypto_decode_653x1541.c | 200 +++ .../ntrulpr653/clean/crypto_decode_653x1541.h | 10 + .../ntrulpr653/clean/crypto_decode_653x3.c | 24 + .../ntrulpr653/clean/crypto_decode_653x3.h | 10 + .../clean/crypto_decode_653xint16.c | 16 + .../clean/crypto_decode_653xint16.h | 10 + .../clean/crypto_decode_653xint32.c | 20 + .../clean/crypto_decode_653xint32.h | 10 + .../ntrulpr653/clean/crypto_encode_256x16.c | 10 + .../ntrulpr653/clean/crypto_encode_256x16.h | 10 + .../ntrulpr653/clean/crypto_encode_256x2.c | 13 + .../ntrulpr653/clean/crypto_encode_256x2.h | 10 + .../ntrulpr653/clean/crypto_encode_653x1541.c | 127 ++ .../ntrulpr653/clean/crypto_encode_653x1541.h |
10 + .../clean/crypto_encode_653x1541round.c | 17 + .../clean/crypto_encode_653x1541round.h | 10 + .../ntrulpr653/clean/crypto_encode_653x3.c | 21 + .../ntrulpr653/clean/crypto_encode_653x3.h | 10 + .../clean/crypto_encode_653xint16.c | 13 + .../clean/crypto_encode_653xint16.h | 10 + .../ntrulpr653/clean/crypto_sort_int32.c | 86 ++ .../ntrulpr653/clean/crypto_sort_int32.h | 10 + .../ntrulpr653/clean/crypto_sort_uint32.c | 20 + .../ntrulpr653/clean/crypto_sort_uint32.h | 10 + .../clean/crypto_stream_aes256ctr.c | 15 + .../clean/crypto_stream_aes256ctr.h | 15 + .../ntrulpr653/clean/crypto_verify_1025.c | 13 + .../ntrulpr653/clean/crypto_verify_1025.h | 8 + crypto_kem/ntrulpr653/clean/kem.c | 287 ++++ crypto_kem/ntrulpr653/clean/params.h | 63 + crypto_kem/ntrulpr761/META.yml | 26 + crypto_kem/ntrulpr761/avx2/LICENSE | 1 + crypto_kem/ntrulpr761/avx2/Makefile | 22 + crypto_kem/ntrulpr761/avx2/api.h | 16 + .../avx2/crypto_core_multsntrup761.c | 314 +++++ .../avx2/crypto_core_multsntrup761.h | 11 + .../avx2/crypto_core_multsntrup761_ntt.c | 927 +++++++++++++ .../avx2/crypto_core_multsntrup761_ntt.h | 13 + .../ntrulpr761/avx2/crypto_decode_256x16.c | 11 + .../ntrulpr761/avx2/crypto_decode_256x16.h | 10 + .../ntrulpr761/avx2/crypto_decode_256x2.c | 27 + .../ntrulpr761/avx2/crypto_decode_256x2.h | 10 + .../ntrulpr761/avx2/crypto_decode_761x1531.c | 436 ++++++ .../ntrulpr761/avx2/crypto_decode_761x1531.h | 10 + .../ntrulpr761/avx2/crypto_decode_761x3.c | 65 + .../ntrulpr761/avx2/crypto_decode_761x3.h | 10 + .../ntrulpr761/avx2/crypto_decode_761xint16.c | 16 + .../ntrulpr761/avx2/crypto_decode_761xint16.h | 10 + .../ntrulpr761/avx2/crypto_decode_761xint32.c | 20 + .../ntrulpr761/avx2/crypto_decode_761xint32.h | 10 + .../ntrulpr761/avx2/crypto_encode_256x16.c | 10 + .../ntrulpr761/avx2/crypto_encode_256x16.h | 10 + .../ntrulpr761/avx2/crypto_encode_256x2.c | 88 ++ .../ntrulpr761/avx2/crypto_encode_256x2.h | 10 + .../ntrulpr761/avx2/crypto_encode_761x1531.c | 301 ++++ .../ntrulpr761/avx2/crypto_encode_761x1531.h | 10 + .../avx2/crypto_encode_761x1531round.c | 303 +++++ .../avx2/crypto_encode_761x1531round.h | 10 + .../ntrulpr761/avx2/crypto_encode_761x3.c | 64 + .../ntrulpr761/avx2/crypto_encode_761x3.h | 10 + .../ntrulpr761/avx2/crypto_encode_761xint16.c | 13 + .../ntrulpr761/avx2/crypto_encode_761xint16.h | 10 + .../ntrulpr761/avx2/crypto_sort_int32.c | 1210 +++++++++++++++++ .../ntrulpr761/avx2/crypto_sort_int32.h | 10 + .../ntrulpr761/avx2/crypto_sort_uint32.c | 20 + .../ntrulpr761/avx2/crypto_sort_uint32.h | 10 + .../ntrulpr761/avx2/crypto_stream_aes256ctr.c | 15 + .../ntrulpr761/avx2/crypto_stream_aes256ctr.h | 15 + .../ntrulpr761/avx2/crypto_verify_1167.c | 36 + .../ntrulpr761/avx2/crypto_verify_1167.h | 8 + crypto_kem/ntrulpr761/avx2/kem.c | 287 ++++ crypto_kem/ntrulpr761/avx2/params.h | 61 + crypto_kem/ntrulpr761/clean/LICENSE | 1 + crypto_kem/ntrulpr761/clean/Makefile | 19 + .../ntrulpr761/clean/Makefile.Microsoft_nmake | 19 + crypto_kem/ntrulpr761/clean/api.h | 16 + .../clean/crypto_core_multsntrup761.c | 60 + .../clean/crypto_core_multsntrup761.h | 11 + .../ntrulpr761/clean/crypto_decode_256x16.c | 11 + .../ntrulpr761/clean/crypto_decode_256x16.h | 10 + .../ntrulpr761/clean/crypto_decode_256x2.c | 10 + .../ntrulpr761/clean/crypto_decode_256x2.h | 10 + .../ntrulpr761/clean/crypto_decode_761x1531.c | 211 +++ .../ntrulpr761/clean/crypto_decode_761x1531.h | 10 + .../ntrulpr761/clean/crypto_decode_761x3.c | 24 + .../ntrulpr761/clean/crypto_decode_761x3.h | 10 + .../clean/crypto_decode_761xint16.c 
| 16 + .../clean/crypto_decode_761xint16.h | 10 + .../clean/crypto_decode_761xint32.c | 20 + .../clean/crypto_decode_761xint32.h | 10 + .../ntrulpr761/clean/crypto_encode_256x16.c | 10 + .../ntrulpr761/clean/crypto_encode_256x16.h | 10 + .../ntrulpr761/clean/crypto_encode_256x2.c | 13 + .../ntrulpr761/clean/crypto_encode_256x2.h | 10 + .../ntrulpr761/clean/crypto_encode_761x1531.c | 119 ++ .../ntrulpr761/clean/crypto_encode_761x1531.h | 10 + .../clean/crypto_encode_761x1531round.c | 17 + .../clean/crypto_encode_761x1531round.h | 10 + .../ntrulpr761/clean/crypto_encode_761x3.c | 21 + .../ntrulpr761/clean/crypto_encode_761x3.h | 10 + .../clean/crypto_encode_761xint16.c | 13 + .../clean/crypto_encode_761xint16.h | 10 + .../ntrulpr761/clean/crypto_sort_int32.c | 86 ++ .../ntrulpr761/clean/crypto_sort_int32.h | 10 + .../ntrulpr761/clean/crypto_sort_uint32.c | 20 + .../ntrulpr761/clean/crypto_sort_uint32.h | 10 + .../clean/crypto_stream_aes256ctr.c | 15 + .../clean/crypto_stream_aes256ctr.h | 15 + .../ntrulpr761/clean/crypto_verify_1167.c | 13 + .../ntrulpr761/clean/crypto_verify_1167.h | 8 + crypto_kem/ntrulpr761/clean/kem.c | 287 ++++ crypto_kem/ntrulpr761/clean/params.h | 63 + crypto_kem/ntrulpr857/META.yml | 26 + crypto_kem/ntrulpr857/avx2/LICENSE | 1 + crypto_kem/ntrulpr857/avx2/Makefile | 22 + crypto_kem/ntrulpr857/avx2/api.h | 16 + .../avx2/crypto_core_multsntrup857.c | 421 ++++++ .../avx2/crypto_core_multsntrup857.h | 11 + .../avx2/crypto_core_multsntrup857_ntt.c | 927 +++++++++++++ .../avx2/crypto_core_multsntrup857_ntt.h | 13 + .../ntrulpr857/avx2/crypto_decode_256x16.c | 11 + .../ntrulpr857/avx2/crypto_decode_256x16.h | 10 + .../ntrulpr857/avx2/crypto_decode_256x2.c | 27 + .../ntrulpr857/avx2/crypto_decode_256x2.h | 10 + .../ntrulpr857/avx2/crypto_decode_857x1723.c | 430 ++++++ .../ntrulpr857/avx2/crypto_decode_857x1723.h | 10 + .../ntrulpr857/avx2/crypto_decode_857x3.c | 65 + .../ntrulpr857/avx2/crypto_decode_857x3.h | 10 + .../ntrulpr857/avx2/crypto_decode_857xint16.c | 16 + .../ntrulpr857/avx2/crypto_decode_857xint16.h | 10 + .../ntrulpr857/avx2/crypto_decode_857xint32.c | 20 + .../ntrulpr857/avx2/crypto_decode_857xint32.h | 10 + .../ntrulpr857/avx2/crypto_encode_256x16.c | 10 + .../ntrulpr857/avx2/crypto_encode_256x16.h | 10 + .../ntrulpr857/avx2/crypto_encode_256x2.c | 88 ++ .../ntrulpr857/avx2/crypto_encode_256x2.h | 10 + .../ntrulpr857/avx2/crypto_encode_857x1723.c | 283 ++++ .../ntrulpr857/avx2/crypto_encode_857x1723.h | 10 + .../avx2/crypto_encode_857x1723round.c | 285 ++++ .../avx2/crypto_encode_857x1723round.h | 10 + .../ntrulpr857/avx2/crypto_encode_857x3.c | 64 + .../ntrulpr857/avx2/crypto_encode_857x3.h | 10 + .../ntrulpr857/avx2/crypto_encode_857xint16.c | 13 + .../ntrulpr857/avx2/crypto_encode_857xint16.h | 10 + .../ntrulpr857/avx2/crypto_sort_int32.c | 1210 +++++++++++++++++ .../ntrulpr857/avx2/crypto_sort_int32.h | 10 + .../ntrulpr857/avx2/crypto_sort_uint32.c | 20 + .../ntrulpr857/avx2/crypto_sort_uint32.h | 10 + .../ntrulpr857/avx2/crypto_stream_aes256ctr.c | 15 + .../ntrulpr857/avx2/crypto_stream_aes256ctr.h | 15 + .../ntrulpr857/avx2/crypto_verify_1312.c | 36 + .../ntrulpr857/avx2/crypto_verify_1312.h | 8 + crypto_kem/ntrulpr857/avx2/kem.c | 287 ++++ crypto_kem/ntrulpr857/avx2/params.h | 61 + crypto_kem/ntrulpr857/clean/LICENSE | 1 + crypto_kem/ntrulpr857/clean/Makefile | 19 + .../ntrulpr857/clean/Makefile.Microsoft_nmake | 19 + crypto_kem/ntrulpr857/clean/api.h | 16 + .../clean/crypto_core_multsntrup857.c | 60 + .../clean/crypto_core_multsntrup857.h | 11 + 
.../ntrulpr857/clean/crypto_decode_256x16.c | 11 + .../ntrulpr857/clean/crypto_decode_256x16.h | 10 + .../ntrulpr857/clean/crypto_decode_256x2.c | 10 + .../ntrulpr857/clean/crypto_decode_256x2.h | 10 + .../ntrulpr857/clean/crypto_decode_857x1723.c | 202 +++ .../ntrulpr857/clean/crypto_decode_857x1723.h | 10 + .../ntrulpr857/clean/crypto_decode_857x3.c | 24 + .../ntrulpr857/clean/crypto_decode_857x3.h | 10 + .../clean/crypto_decode_857xint16.c | 16 + .../clean/crypto_decode_857xint16.h | 10 + .../clean/crypto_decode_857xint32.c | 20 + .../clean/crypto_decode_857xint32.h | 10 + .../ntrulpr857/clean/crypto_encode_256x16.c | 10 + .../ntrulpr857/clean/crypto_encode_256x16.h | 10 + .../ntrulpr857/clean/crypto_encode_256x2.c | 13 + .../ntrulpr857/clean/crypto_encode_256x2.h | 10 + .../ntrulpr857/clean/crypto_encode_857x1723.c | 130 ++ .../ntrulpr857/clean/crypto_encode_857x1723.h | 10 + .../clean/crypto_encode_857x1723round.c | 17 + .../clean/crypto_encode_857x1723round.h | 10 + .../ntrulpr857/clean/crypto_encode_857x3.c | 21 + .../ntrulpr857/clean/crypto_encode_857x3.h | 10 + .../clean/crypto_encode_857xint16.c | 13 + .../clean/crypto_encode_857xint16.h | 10 + .../ntrulpr857/clean/crypto_sort_int32.c | 86 ++ .../ntrulpr857/clean/crypto_sort_int32.h | 10 + .../ntrulpr857/clean/crypto_sort_uint32.c | 20 + .../ntrulpr857/clean/crypto_sort_uint32.h | 10 + .../clean/crypto_stream_aes256ctr.c | 15 + .../clean/crypto_stream_aes256ctr.h | 15 + .../ntrulpr857/clean/crypto_verify_1312.c | 13 + .../ntrulpr857/clean/crypto_verify_1312.h | 8 + crypto_kem/ntrulpr857/clean/kem.c | 287 ++++ crypto_kem/ntrulpr857/clean/params.h | 63 + crypto_kem/sntrup653/META.yml | 26 + crypto_kem/sntrup653/avx2/LICENSE | 1 + crypto_kem/sntrup653/avx2/Makefile | 22 + crypto_kem/sntrup653/avx2/api.h | 16 + .../avx2/crypto_core_inv3sntrup653.c | 542 ++++++++ .../avx2/crypto_core_inv3sntrup653.h | 11 + .../sntrup653/avx2/crypto_core_invsntrup653.c | 202 +++ .../sntrup653/avx2/crypto_core_invsntrup653.h | 11 + .../avx2/crypto_core_mult3sntrup653.c | 259 ++++ .../avx2/crypto_core_mult3sntrup653.h | 11 + .../avx2/crypto_core_multsntrup653.c | 314 +++++ .../avx2/crypto_core_multsntrup653.h | 11 + .../avx2/crypto_core_multsntrup653_ntt.c | 927 +++++++++++++ .../avx2/crypto_core_multsntrup653_ntt.h | 13 + .../avx2/crypto_core_scale3sntrup653.c | 47 + .../avx2/crypto_core_scale3sntrup653.h | 11 + .../avx2/crypto_core_weightsntrup653.c | 45 + .../avx2/crypto_core_weightsntrup653.h | 11 + .../avx2/crypto_core_wforcesntrup653.c | 61 + .../avx2/crypto_core_wforcesntrup653.h | 11 + .../sntrup653/avx2/crypto_decode_653x1541.c | 408 ++++++ .../sntrup653/avx2/crypto_decode_653x1541.h | 10 + .../sntrup653/avx2/crypto_decode_653x3.c | 65 + .../sntrup653/avx2/crypto_decode_653x3.h | 10 + .../sntrup653/avx2/crypto_decode_653x4621.c | 408 ++++++ .../sntrup653/avx2/crypto_decode_653x4621.h | 10 + .../sntrup653/avx2/crypto_decode_653xint16.c | 16 + .../sntrup653/avx2/crypto_decode_653xint16.h | 10 + .../sntrup653/avx2/crypto_decode_653xint32.c | 20 + .../sntrup653/avx2/crypto_decode_653xint32.h | 10 + .../sntrup653/avx2/crypto_decode_int16.c | 9 + .../sntrup653/avx2/crypto_decode_int16.h | 9 + .../sntrup653/avx2/crypto_encode_653x1541.c | 286 ++++ .../sntrup653/avx2/crypto_encode_653x1541.h | 10 + .../avx2/crypto_encode_653x1541round.c | 288 ++++ .../avx2/crypto_encode_653x1541round.h | 10 + .../sntrup653/avx2/crypto_encode_653x3.c | 64 + .../sntrup653/avx2/crypto_encode_653x3.h | 10 + .../sntrup653/avx2/crypto_encode_653x4621.c | 288 ++++ 
.../sntrup653/avx2/crypto_encode_653x4621.h | 10 + .../avx2/crypto_encode_653xfreeze3.c | 31 + .../avx2/crypto_encode_653xfreeze3.h | 10 + .../sntrup653/avx2/crypto_encode_653xint16.c | 13 + .../sntrup653/avx2/crypto_encode_653xint16.h | 10 + .../sntrup653/avx2/crypto_encode_int16.c | 9 + .../sntrup653/avx2/crypto_encode_int16.h | 10 + crypto_kem/sntrup653/avx2/crypto_sort_int32.c | 1210 +++++++++++++++++ crypto_kem/sntrup653/avx2/crypto_sort_int32.h | 10 + .../sntrup653/avx2/crypto_sort_uint32.c | 20 + .../sntrup653/avx2/crypto_sort_uint32.h | 10 + .../sntrup653/avx2/crypto_stream_aes256ctr.c | 15 + .../sntrup653/avx2/crypto_stream_aes256ctr.h | 15 + crypto_kem/sntrup653/avx2/crypto_verify_897.c | 36 + crypto_kem/sntrup653/avx2/crypto_verify_897.h | 8 + crypto_kem/sntrup653/avx2/kem.c | 247 ++++ crypto_kem/sntrup653/avx2/params.h | 71 + crypto_kem/sntrup653/clean/LICENSE | 1 + crypto_kem/sntrup653/clean/Makefile | 19 + .../sntrup653/clean/Makefile.Microsoft_nmake | 19 + crypto_kem/sntrup653/clean/api.h | 16 + .../clean/crypto_core_inv3sntrup653.c | 110 ++ .../clean/crypto_core_inv3sntrup653.h | 11 + .../clean/crypto_core_invsntrup653.c | 131 ++ .../clean/crypto_core_invsntrup653.h | 11 + .../clean/crypto_core_mult3sntrup653.c | 57 + .../clean/crypto_core_mult3sntrup653.h | 11 + .../clean/crypto_core_multsntrup653.c | 60 + .../clean/crypto_core_multsntrup653.h | 11 + .../clean/crypto_core_scale3sntrup653.c | 32 + .../clean/crypto_core_scale3sntrup653.h | 11 + .../clean/crypto_core_weightsntrup653.c | 21 + .../clean/crypto_core_weightsntrup653.h | 11 + .../clean/crypto_core_wforcesntrup653.c | 48 + .../clean/crypto_core_wforcesntrup653.h | 11 + .../sntrup653/clean/crypto_decode_653x1541.c | 200 +++ .../sntrup653/clean/crypto_decode_653x1541.h | 10 + .../sntrup653/clean/crypto_decode_653x3.c | 24 + .../sntrup653/clean/crypto_decode_653x3.h | 10 + .../sntrup653/clean/crypto_decode_653x4621.c | 198 +++ .../sntrup653/clean/crypto_decode_653x4621.h | 10 + .../sntrup653/clean/crypto_decode_653xint16.c | 16 + .../sntrup653/clean/crypto_decode_653xint16.h | 10 + .../sntrup653/clean/crypto_decode_653xint32.c | 20 + .../sntrup653/clean/crypto_decode_653xint32.h | 10 + .../sntrup653/clean/crypto_encode_653x1541.c | 127 ++ .../sntrup653/clean/crypto_encode_653x1541.h | 10 + .../clean/crypto_encode_653x1541round.c | 17 + .../clean/crypto_encode_653x1541round.h | 10 + .../sntrup653/clean/crypto_encode_653x3.c | 21 + .../sntrup653/clean/crypto_encode_653x3.h | 10 + .../sntrup653/clean/crypto_encode_653x4621.c | 127 ++ .../sntrup653/clean/crypto_encode_653x4621.h | 10 + .../clean/crypto_encode_653xfreeze3.c | 25 + .../clean/crypto_encode_653xfreeze3.h | 10 + .../sntrup653/clean/crypto_encode_653xint16.c | 13 + .../sntrup653/clean/crypto_encode_653xint16.h | 10 + .../sntrup653/clean/crypto_encode_int16.c | 9 + .../sntrup653/clean/crypto_encode_int16.h | 10 + .../sntrup653/clean/crypto_sort_int32.c | 86 ++ .../sntrup653/clean/crypto_sort_int32.h | 10 + .../sntrup653/clean/crypto_sort_uint32.c | 20 + .../sntrup653/clean/crypto_sort_uint32.h | 10 + .../sntrup653/clean/crypto_stream_aes256ctr.c | 15 + .../sntrup653/clean/crypto_stream_aes256ctr.h | 15 + .../sntrup653/clean/crypto_verify_897.c | 13 + .../sntrup653/clean/crypto_verify_897.h | 8 + crypto_kem/sntrup653/clean/kem.c | 247 ++++ crypto_kem/sntrup653/clean/params.h | 68 + crypto_kem/sntrup761/META.yml | 26 + crypto_kem/sntrup761/avx2/LICENSE | 1 + crypto_kem/sntrup761/avx2/Makefile | 22 + crypto_kem/sntrup761/avx2/api.h | 16 + 
.../avx2/crypto_core_inv3sntrup761.c | 542 ++++++++ .../avx2/crypto_core_inv3sntrup761.h | 11 + .../sntrup761/avx2/crypto_core_invsntrup761.c | 202 +++ .../sntrup761/avx2/crypto_core_invsntrup761.h | 11 + .../avx2/crypto_core_mult3sntrup761.c | 259 ++++ .../avx2/crypto_core_mult3sntrup761.h | 11 + .../avx2/crypto_core_multsntrup761.c | 314 +++++ .../avx2/crypto_core_multsntrup761.h | 11 + .../avx2/crypto_core_multsntrup761_ntt.c | 927 +++++++++++++ .../avx2/crypto_core_multsntrup761_ntt.h | 13 + .../avx2/crypto_core_scale3sntrup761.c | 47 + .../avx2/crypto_core_scale3sntrup761.h | 11 + .../avx2/crypto_core_weightsntrup761.c | 44 + .../avx2/crypto_core_weightsntrup761.h | 11 + .../avx2/crypto_core_wforcesntrup761.c | 61 + .../avx2/crypto_core_wforcesntrup761.h | 11 + .../sntrup761/avx2/crypto_decode_761x1531.c | 436 ++++++ .../sntrup761/avx2/crypto_decode_761x1531.h | 10 + .../sntrup761/avx2/crypto_decode_761x3.c | 65 + .../sntrup761/avx2/crypto_decode_761x3.h | 10 + .../sntrup761/avx2/crypto_decode_761x4591.c | 436 ++++++ .../sntrup761/avx2/crypto_decode_761x4591.h | 10 + .../sntrup761/avx2/crypto_decode_761xint16.c | 16 + .../sntrup761/avx2/crypto_decode_761xint16.h | 10 + .../sntrup761/avx2/crypto_decode_761xint32.c | 20 + .../sntrup761/avx2/crypto_decode_761xint32.h | 10 + .../sntrup761/avx2/crypto_decode_int16.c | 9 + .../sntrup761/avx2/crypto_decode_int16.h | 9 + .../sntrup761/avx2/crypto_encode_761x1531.c | 301 ++++ .../sntrup761/avx2/crypto_encode_761x1531.h | 10 + .../avx2/crypto_encode_761x1531round.c | 303 +++++ .../avx2/crypto_encode_761x1531round.h | 10 + .../sntrup761/avx2/crypto_encode_761x3.c | 64 + .../sntrup761/avx2/crypto_encode_761x3.h | 10 + .../sntrup761/avx2/crypto_encode_761x4591.c | 308 +++++ .../sntrup761/avx2/crypto_encode_761x4591.h | 10 + .../avx2/crypto_encode_761xfreeze3.c | 31 + .../avx2/crypto_encode_761xfreeze3.h | 10 + .../sntrup761/avx2/crypto_encode_761xint16.c | 13 + .../sntrup761/avx2/crypto_encode_761xint16.h | 10 + .../sntrup761/avx2/crypto_encode_int16.c | 9 + .../sntrup761/avx2/crypto_encode_int16.h | 10 + crypto_kem/sntrup761/avx2/crypto_sort_int32.c | 1210 +++++++++++++++++ crypto_kem/sntrup761/avx2/crypto_sort_int32.h | 10 + .../sntrup761/avx2/crypto_sort_uint32.c | 20 + .../sntrup761/avx2/crypto_sort_uint32.h | 10 + .../sntrup761/avx2/crypto_stream_aes256ctr.c | 15 + .../sntrup761/avx2/crypto_stream_aes256ctr.h | 15 + .../sntrup761/avx2/crypto_verify_1039.c | 36 + .../sntrup761/avx2/crypto_verify_1039.h | 8 + crypto_kem/sntrup761/avx2/kem.c | 247 ++++ crypto_kem/sntrup761/avx2/params.h | 71 + crypto_kem/sntrup761/clean/LICENSE | 1 + crypto_kem/sntrup761/clean/Makefile | 19 + .../sntrup761/clean/Makefile.Microsoft_nmake | 19 + crypto_kem/sntrup761/clean/api.h | 16 + .../clean/crypto_core_inv3sntrup761.c | 110 ++ .../clean/crypto_core_inv3sntrup761.h | 11 + .../clean/crypto_core_invsntrup761.c | 130 ++ .../clean/crypto_core_invsntrup761.h | 11 + .../clean/crypto_core_mult3sntrup761.c | 57 + .../clean/crypto_core_mult3sntrup761.h | 11 + .../clean/crypto_core_multsntrup761.c | 60 + .../clean/crypto_core_multsntrup761.h | 11 + .../clean/crypto_core_scale3sntrup761.c | 32 + .../clean/crypto_core_scale3sntrup761.h | 11 + .../clean/crypto_core_weightsntrup761.c | 21 + .../clean/crypto_core_weightsntrup761.h | 11 + .../clean/crypto_core_wforcesntrup761.c | 48 + .../clean/crypto_core_wforcesntrup761.h | 11 + .../sntrup761/clean/crypto_decode_761x1531.c | 211 +++ .../sntrup761/clean/crypto_decode_761x1531.h | 10 + .../sntrup761/clean/crypto_decode_761x3.c 
| 24 + .../sntrup761/clean/crypto_decode_761x3.h | 10 + .../sntrup761/clean/crypto_decode_761x4591.c | 211 +++ .../sntrup761/clean/crypto_decode_761x4591.h | 10 + .../sntrup761/clean/crypto_decode_761xint16.c | 16 + .../sntrup761/clean/crypto_decode_761xint16.h | 10 + .../sntrup761/clean/crypto_decode_761xint32.c | 20 + .../sntrup761/clean/crypto_decode_761xint32.h | 10 + .../sntrup761/clean/crypto_encode_761x1531.c | 119 ++ .../sntrup761/clean/crypto_encode_761x1531.h | 10 + .../clean/crypto_encode_761x1531round.c | 17 + .../clean/crypto_encode_761x1531round.h | 10 + .../sntrup761/clean/crypto_encode_761x3.c | 21 + .../sntrup761/clean/crypto_encode_761x3.h | 10 + .../sntrup761/clean/crypto_encode_761x4591.c | 147 ++ .../sntrup761/clean/crypto_encode_761x4591.h | 10 + .../clean/crypto_encode_761xfreeze3.c | 25 + .../clean/crypto_encode_761xfreeze3.h | 10 + .../sntrup761/clean/crypto_encode_761xint16.c | 13 + .../sntrup761/clean/crypto_encode_761xint16.h | 10 + .../sntrup761/clean/crypto_encode_int16.c | 9 + .../sntrup761/clean/crypto_encode_int16.h | 10 + .../sntrup761/clean/crypto_sort_int32.c | 86 ++ .../sntrup761/clean/crypto_sort_int32.h | 10 + .../sntrup761/clean/crypto_sort_uint32.c | 20 + .../sntrup761/clean/crypto_sort_uint32.h | 10 + .../sntrup761/clean/crypto_stream_aes256ctr.c | 15 + .../sntrup761/clean/crypto_stream_aes256ctr.h | 15 + .../sntrup761/clean/crypto_verify_1039.c | 13 + .../sntrup761/clean/crypto_verify_1039.h | 8 + crypto_kem/sntrup761/clean/kem.c | 247 ++++ crypto_kem/sntrup761/clean/params.h | 68 + crypto_kem/sntrup857/META.yml | 26 + crypto_kem/sntrup857/avx2/LICENSE | 1 + crypto_kem/sntrup857/avx2/Makefile | 22 + crypto_kem/sntrup857/avx2/api.h | 16 + .../avx2/crypto_core_inv3sntrup857.c | 658 +++++++++ .../avx2/crypto_core_inv3sntrup857.h | 11 + .../sntrup857/avx2/crypto_core_invsntrup857.c | 202 +++ .../sntrup857/avx2/crypto_core_invsntrup857.h | 11 + .../avx2/crypto_core_mult3sntrup857.c | 296 ++++ .../avx2/crypto_core_mult3sntrup857.h | 11 + .../avx2/crypto_core_multsntrup857.c | 421 ++++++ .../avx2/crypto_core_multsntrup857.h | 11 + .../avx2/crypto_core_multsntrup857_ntt.c | 927 +++++++++++++ .../avx2/crypto_core_multsntrup857_ntt.h | 13 + .../avx2/crypto_core_scale3sntrup857.c | 47 + .../avx2/crypto_core_scale3sntrup857.h | 11 + .../avx2/crypto_core_weightsntrup857.c | 45 + .../avx2/crypto_core_weightsntrup857.h | 11 + .../avx2/crypto_core_wforcesntrup857.c | 61 + .../avx2/crypto_core_wforcesntrup857.h | 11 + .../sntrup857/avx2/crypto_decode_857x1723.c | 430 ++++++ .../sntrup857/avx2/crypto_decode_857x1723.h | 10 + .../sntrup857/avx2/crypto_decode_857x3.c | 65 + .../sntrup857/avx2/crypto_decode_857x3.h | 10 + .../sntrup857/avx2/crypto_decode_857x5167.c | 424 ++++++ .../sntrup857/avx2/crypto_decode_857x5167.h | 10 + .../sntrup857/avx2/crypto_decode_857xint16.c | 16 + .../sntrup857/avx2/crypto_decode_857xint16.h | 10 + .../sntrup857/avx2/crypto_decode_857xint32.c | 20 + .../sntrup857/avx2/crypto_decode_857xint32.h | 10 + .../sntrup857/avx2/crypto_decode_int16.c | 9 + .../sntrup857/avx2/crypto_decode_int16.h | 9 + .../sntrup857/avx2/crypto_encode_857x1723.c | 283 ++++ .../sntrup857/avx2/crypto_encode_857x1723.h | 10 + .../avx2/crypto_encode_857x1723round.c | 285 ++++ .../avx2/crypto_encode_857x1723round.h | 10 + .../sntrup857/avx2/crypto_encode_857x3.c | 64 + .../sntrup857/avx2/crypto_encode_857x3.h | 10 + .../sntrup857/avx2/crypto_encode_857x5167.c | 331 +++++ .../sntrup857/avx2/crypto_encode_857x5167.h | 10 + .../avx2/crypto_encode_857xfreeze3.c | 31 + 
.../avx2/crypto_encode_857xfreeze3.h | 10 + .../sntrup857/avx2/crypto_encode_857xint16.c | 13 + .../sntrup857/avx2/crypto_encode_857xint16.h | 10 + .../sntrup857/avx2/crypto_encode_int16.c | 9 + .../sntrup857/avx2/crypto_encode_int16.h | 10 + crypto_kem/sntrup857/avx2/crypto_sort_int32.c | 1210 +++++++++++++++++ crypto_kem/sntrup857/avx2/crypto_sort_int32.h | 10 + .../sntrup857/avx2/crypto_sort_uint32.c | 20 + .../sntrup857/avx2/crypto_sort_uint32.h | 10 + .../sntrup857/avx2/crypto_stream_aes256ctr.c | 15 + .../sntrup857/avx2/crypto_stream_aes256ctr.h | 15 + .../sntrup857/avx2/crypto_verify_1184.c | 36 + .../sntrup857/avx2/crypto_verify_1184.h | 8 + crypto_kem/sntrup857/avx2/kem.c | 247 ++++ crypto_kem/sntrup857/avx2/params.h | 71 + crypto_kem/sntrup857/clean/LICENSE | 1 + crypto_kem/sntrup857/clean/Makefile | 19 + .../sntrup857/clean/Makefile.Microsoft_nmake | 19 + crypto_kem/sntrup857/clean/api.h | 16 + .../clean/crypto_core_inv3sntrup857.c | 110 ++ .../clean/crypto_core_inv3sntrup857.h | 11 + .../clean/crypto_core_invsntrup857.c | 131 ++ .../clean/crypto_core_invsntrup857.h | 11 + .../clean/crypto_core_mult3sntrup857.c | 57 + .../clean/crypto_core_mult3sntrup857.h | 11 + .../clean/crypto_core_multsntrup857.c | 60 + .../clean/crypto_core_multsntrup857.h | 11 + .../clean/crypto_core_scale3sntrup857.c | 32 + .../clean/crypto_core_scale3sntrup857.h | 11 + .../clean/crypto_core_weightsntrup857.c | 21 + .../clean/crypto_core_weightsntrup857.h | 11 + .../clean/crypto_core_wforcesntrup857.c | 48 + .../clean/crypto_core_wforcesntrup857.h | 11 + .../sntrup857/clean/crypto_decode_857x1723.c | 202 +++ .../sntrup857/clean/crypto_decode_857x1723.h | 10 + .../sntrup857/clean/crypto_decode_857x3.c | 24 + .../sntrup857/clean/crypto_decode_857x3.h | 10 + .../sntrup857/clean/crypto_decode_857x5167.c | 205 +++ .../sntrup857/clean/crypto_decode_857x5167.h | 10 + .../sntrup857/clean/crypto_decode_857xint16.c | 16 + .../sntrup857/clean/crypto_decode_857xint16.h | 10 + .../sntrup857/clean/crypto_decode_857xint32.c | 20 + .../sntrup857/clean/crypto_decode_857xint32.h | 10 + .../sntrup857/clean/crypto_encode_857x1723.c | 130 ++ .../sntrup857/clean/crypto_encode_857x1723.h | 10 + .../clean/crypto_encode_857x1723round.c | 17 + .../clean/crypto_encode_857x1723round.h | 10 + .../sntrup857/clean/crypto_encode_857x3.c | 21 + .../sntrup857/clean/crypto_encode_857x3.h | 10 + .../sntrup857/clean/crypto_encode_857x5167.c | 138 ++ .../sntrup857/clean/crypto_encode_857x5167.h | 10 + .../clean/crypto_encode_857xfreeze3.c | 25 + .../clean/crypto_encode_857xfreeze3.h | 10 + .../sntrup857/clean/crypto_encode_857xint16.c | 13 + .../sntrup857/clean/crypto_encode_857xint16.h | 10 + .../sntrup857/clean/crypto_encode_int16.c | 9 + .../sntrup857/clean/crypto_encode_int16.h | 10 + .../sntrup857/clean/crypto_sort_int32.c | 86 ++ .../sntrup857/clean/crypto_sort_int32.h | 10 + .../sntrup857/clean/crypto_sort_uint32.c | 20 + .../sntrup857/clean/crypto_sort_uint32.h | 10 + .../sntrup857/clean/crypto_stream_aes256ctr.c | 15 + .../sntrup857/clean/crypto_stream_aes256ctr.h | 15 + .../sntrup857/clean/crypto_verify_1184.c | 13 + .../sntrup857/clean/crypto_verify_1184.h | 8 + crypto_kem/sntrup857/clean/kem.c | 247 ++++ crypto_kem/sntrup857/clean/params.h | 68 + .../duplicate_consistency/ntrulpr653_avx2.yml | 184 +++ .../ntrulpr653_clean.yml | 182 +++ .../duplicate_consistency/ntrulpr761_avx2.yml | 150 ++ .../ntrulpr761_clean.yml | 148 ++ .../duplicate_consistency/ntrulpr857_avx2.yml | 116 ++ .../ntrulpr857_clean.yml | 114 ++ 
test/duplicate_consistency/sntrup653_avx2.yml | 86 ++ .../duplicate_consistency/sntrup653_clean.yml | 82 ++ test/duplicate_consistency/sntrup761_avx2.yml | 60 + .../duplicate_consistency/sntrup761_clean.yml | 58 + test/duplicate_consistency/sntrup857_avx2.yml | 34 + .../duplicate_consistency/sntrup857_clean.yml | 34 + 582 files changed, 43638 insertions(+) create mode 100644 crypto_kem/ntrulpr653/META.yml create mode 100644 crypto_kem/ntrulpr653/avx2/LICENSE create mode 100644 crypto_kem/ntrulpr653/avx2/Makefile create mode 100644 crypto_kem/ntrulpr653/avx2/api.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653_ntt.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653_ntt.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_256x16.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_256x16.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_256x2.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_256x2.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_653x1541.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_653x1541.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_653x3.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_653x3.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_653xint16.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_653xint16.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_653xint32.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_decode_653xint32.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_256x16.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_256x16.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_256x2.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_256x2.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541round.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541round.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_653x3.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_653x3.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_653xint16.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_encode_653xint16.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_sort_int32.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_sort_int32.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_sort_uint32.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_sort_uint32.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_stream_aes256ctr.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_stream_aes256ctr.h create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_verify_1025.c create mode 100644 crypto_kem/ntrulpr653/avx2/crypto_verify_1025.h create mode 100644 crypto_kem/ntrulpr653/avx2/kem.c create mode 100644 crypto_kem/ntrulpr653/avx2/params.h create mode 100644 crypto_kem/ntrulpr653/clean/LICENSE create mode 100644 crypto_kem/ntrulpr653/clean/Makefile create mode 100644 crypto_kem/ntrulpr653/clean/Makefile.Microsoft_nmake create mode 100644 crypto_kem/ntrulpr653/clean/api.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_core_multsntrup653.c create mode 100644 
crypto_kem/ntrulpr653/clean/crypto_core_multsntrup653.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_256x16.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_256x16.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_256x2.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_256x2.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_653x1541.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_653x1541.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_653x3.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_653x3.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_653xint16.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_653xint16.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_653xint32.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_decode_653xint32.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_256x16.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_256x16.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_256x2.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_256x2.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_653x1541.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_653x1541.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_653x1541round.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_653x1541round.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_653x3.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_653x3.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_653xint16.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_encode_653xint16.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_sort_int32.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_sort_int32.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_sort_uint32.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_sort_uint32.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_stream_aes256ctr.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_stream_aes256ctr.h create mode 100644 crypto_kem/ntrulpr653/clean/crypto_verify_1025.c create mode 100644 crypto_kem/ntrulpr653/clean/crypto_verify_1025.h create mode 100644 crypto_kem/ntrulpr653/clean/kem.c create mode 100644 crypto_kem/ntrulpr653/clean/params.h create mode 100644 crypto_kem/ntrulpr761/META.yml create mode 100644 crypto_kem/ntrulpr761/avx2/LICENSE create mode 100644 crypto_kem/ntrulpr761/avx2/Makefile create mode 100644 crypto_kem/ntrulpr761/avx2/api.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761_ntt.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761_ntt.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_decode_256x16.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_decode_256x16.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_decode_256x2.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_decode_256x2.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_decode_761x1531.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_decode_761x1531.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_decode_761x3.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_decode_761x3.h create mode 100644 
crypto_kem/ntrulpr761/avx2/crypto_decode_761xint16.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_decode_761xint16.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_decode_761xint32.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_decode_761xint32.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_256x16.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_256x16.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_256x2.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_256x2.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531round.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531round.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_761x3.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_761x3.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_761xint16.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_encode_761xint16.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_sort_int32.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_sort_int32.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_sort_uint32.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_sort_uint32.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_stream_aes256ctr.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_stream_aes256ctr.h create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_verify_1167.c create mode 100644 crypto_kem/ntrulpr761/avx2/crypto_verify_1167.h create mode 100644 crypto_kem/ntrulpr761/avx2/kem.c create mode 100644 crypto_kem/ntrulpr761/avx2/params.h create mode 100644 crypto_kem/ntrulpr761/clean/LICENSE create mode 100644 crypto_kem/ntrulpr761/clean/Makefile create mode 100644 crypto_kem/ntrulpr761/clean/Makefile.Microsoft_nmake create mode 100644 crypto_kem/ntrulpr761/clean/api.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_core_multsntrup761.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_core_multsntrup761.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_256x16.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_256x16.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_256x2.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_256x2.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_761x1531.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_761x1531.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_761x3.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_761x3.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_761xint16.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_761xint16.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_761xint32.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_decode_761xint32.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_encode_256x16.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_encode_256x16.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_encode_256x2.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_encode_256x2.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_encode_761x1531.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_encode_761x1531.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_encode_761x1531round.c create mode 100644 
crypto_kem/ntrulpr761/clean/crypto_encode_761x1531round.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_encode_761x3.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_encode_761x3.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_encode_761xint16.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_encode_761xint16.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_sort_int32.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_sort_int32.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_sort_uint32.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_sort_uint32.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_stream_aes256ctr.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_stream_aes256ctr.h create mode 100644 crypto_kem/ntrulpr761/clean/crypto_verify_1167.c create mode 100644 crypto_kem/ntrulpr761/clean/crypto_verify_1167.h create mode 100644 crypto_kem/ntrulpr761/clean/kem.c create mode 100644 crypto_kem/ntrulpr761/clean/params.h create mode 100644 crypto_kem/ntrulpr857/META.yml create mode 100644 crypto_kem/ntrulpr857/avx2/LICENSE create mode 100644 crypto_kem/ntrulpr857/avx2/Makefile create mode 100644 crypto_kem/ntrulpr857/avx2/api.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857_ntt.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857_ntt.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_256x16.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_256x16.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_256x2.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_256x2.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_857x1723.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_857x1723.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_857x3.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_857x3.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_857xint16.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_857xint16.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_857xint32.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_decode_857xint32.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_256x16.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_256x16.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_256x2.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_256x2.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723round.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723round.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_857x3.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_857x3.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_857xint16.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_encode_857xint16.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_sort_int32.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_sort_int32.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_sort_uint32.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_sort_uint32.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_stream_aes256ctr.c create 
mode 100644 crypto_kem/ntrulpr857/avx2/crypto_stream_aes256ctr.h create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_verify_1312.c create mode 100644 crypto_kem/ntrulpr857/avx2/crypto_verify_1312.h create mode 100644 crypto_kem/ntrulpr857/avx2/kem.c create mode 100644 crypto_kem/ntrulpr857/avx2/params.h create mode 100644 crypto_kem/ntrulpr857/clean/LICENSE create mode 100644 crypto_kem/ntrulpr857/clean/Makefile create mode 100644 crypto_kem/ntrulpr857/clean/Makefile.Microsoft_nmake create mode 100644 crypto_kem/ntrulpr857/clean/api.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_core_multsntrup857.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_core_multsntrup857.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_256x16.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_256x16.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_256x2.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_256x2.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_857x1723.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_857x1723.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_857x3.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_857x3.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_857xint16.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_857xint16.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_857xint32.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_decode_857xint32.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_256x16.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_256x16.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_256x2.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_256x2.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_857x1723.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_857x1723.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_857x1723round.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_857x1723round.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_857x3.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_857x3.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_857xint16.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_encode_857xint16.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_sort_int32.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_sort_int32.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_sort_uint32.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_sort_uint32.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_stream_aes256ctr.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_stream_aes256ctr.h create mode 100644 crypto_kem/ntrulpr857/clean/crypto_verify_1312.c create mode 100644 crypto_kem/ntrulpr857/clean/crypto_verify_1312.h create mode 100644 crypto_kem/ntrulpr857/clean/kem.c create mode 100644 crypto_kem/ntrulpr857/clean/params.h create mode 100644 crypto_kem/sntrup653/META.yml create mode 100644 crypto_kem/sntrup653/avx2/LICENSE create mode 100644 crypto_kem/sntrup653/avx2/Makefile create mode 100644 crypto_kem/sntrup653/avx2/api.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_inv3sntrup653.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_inv3sntrup653.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_invsntrup653.c create mode 100644 
crypto_kem/sntrup653/avx2/crypto_core_invsntrup653.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_mult3sntrup653.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_mult3sntrup653.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_multsntrup653.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_multsntrup653.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_multsntrup653_ntt.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_multsntrup653_ntt.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_scale3sntrup653.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_scale3sntrup653.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_weightsntrup653.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_weightsntrup653.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_wforcesntrup653.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_core_wforcesntrup653.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_653x1541.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_653x1541.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_653x3.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_653x3.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_653x4621.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_653x4621.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_653xint16.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_653xint16.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_653xint32.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_653xint32.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_int16.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_decode_int16.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653x1541.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653x1541.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653x1541round.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653x1541round.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653x3.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653x3.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653x4621.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653x4621.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653xfreeze3.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653xfreeze3.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653xint16.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_653xint16.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_int16.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_encode_int16.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_sort_int32.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_sort_int32.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_sort_uint32.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_sort_uint32.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_stream_aes256ctr.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_stream_aes256ctr.h create mode 100644 crypto_kem/sntrup653/avx2/crypto_verify_897.c create mode 100644 crypto_kem/sntrup653/avx2/crypto_verify_897.h create mode 100644 crypto_kem/sntrup653/avx2/kem.c create mode 100644 crypto_kem/sntrup653/avx2/params.h create mode 100644 crypto_kem/sntrup653/clean/LICENSE create mode 100644 crypto_kem/sntrup653/clean/Makefile create mode 100644 
crypto_kem/sntrup653/clean/Makefile.Microsoft_nmake create mode 100644 crypto_kem/sntrup653/clean/api.h create mode 100644 crypto_kem/sntrup653/clean/crypto_core_inv3sntrup653.c create mode 100644 crypto_kem/sntrup653/clean/crypto_core_inv3sntrup653.h create mode 100644 crypto_kem/sntrup653/clean/crypto_core_invsntrup653.c create mode 100644 crypto_kem/sntrup653/clean/crypto_core_invsntrup653.h create mode 100644 crypto_kem/sntrup653/clean/crypto_core_mult3sntrup653.c create mode 100644 crypto_kem/sntrup653/clean/crypto_core_mult3sntrup653.h create mode 100644 crypto_kem/sntrup653/clean/crypto_core_multsntrup653.c create mode 100644 crypto_kem/sntrup653/clean/crypto_core_multsntrup653.h create mode 100644 crypto_kem/sntrup653/clean/crypto_core_scale3sntrup653.c create mode 100644 crypto_kem/sntrup653/clean/crypto_core_scale3sntrup653.h create mode 100644 crypto_kem/sntrup653/clean/crypto_core_weightsntrup653.c create mode 100644 crypto_kem/sntrup653/clean/crypto_core_weightsntrup653.h create mode 100644 crypto_kem/sntrup653/clean/crypto_core_wforcesntrup653.c create mode 100644 crypto_kem/sntrup653/clean/crypto_core_wforcesntrup653.h create mode 100644 crypto_kem/sntrup653/clean/crypto_decode_653x1541.c create mode 100644 crypto_kem/sntrup653/clean/crypto_decode_653x1541.h create mode 100644 crypto_kem/sntrup653/clean/crypto_decode_653x3.c create mode 100644 crypto_kem/sntrup653/clean/crypto_decode_653x3.h create mode 100644 crypto_kem/sntrup653/clean/crypto_decode_653x4621.c create mode 100644 crypto_kem/sntrup653/clean/crypto_decode_653x4621.h create mode 100644 crypto_kem/sntrup653/clean/crypto_decode_653xint16.c create mode 100644 crypto_kem/sntrup653/clean/crypto_decode_653xint16.h create mode 100644 crypto_kem/sntrup653/clean/crypto_decode_653xint32.c create mode 100644 crypto_kem/sntrup653/clean/crypto_decode_653xint32.h create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653x1541.c create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653x1541.h create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653x1541round.c create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653x1541round.h create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653x3.c create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653x3.h create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653x4621.c create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653x4621.h create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653xfreeze3.c create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653xfreeze3.h create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653xint16.c create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_653xint16.h create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_int16.c create mode 100644 crypto_kem/sntrup653/clean/crypto_encode_int16.h create mode 100644 crypto_kem/sntrup653/clean/crypto_sort_int32.c create mode 100644 crypto_kem/sntrup653/clean/crypto_sort_int32.h create mode 100644 crypto_kem/sntrup653/clean/crypto_sort_uint32.c create mode 100644 crypto_kem/sntrup653/clean/crypto_sort_uint32.h create mode 100644 crypto_kem/sntrup653/clean/crypto_stream_aes256ctr.c create mode 100644 crypto_kem/sntrup653/clean/crypto_stream_aes256ctr.h create mode 100644 crypto_kem/sntrup653/clean/crypto_verify_897.c create mode 100644 crypto_kem/sntrup653/clean/crypto_verify_897.h create mode 100644 crypto_kem/sntrup653/clean/kem.c create mode 100644 crypto_kem/sntrup653/clean/params.h create mode 100644 
crypto_kem/sntrup761/META.yml create mode 100644 crypto_kem/sntrup761/avx2/LICENSE create mode 100644 crypto_kem/sntrup761/avx2/Makefile create mode 100644 crypto_kem/sntrup761/avx2/api.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_inv3sntrup761.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_inv3sntrup761.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_invsntrup761.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_invsntrup761.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_mult3sntrup761.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_mult3sntrup761.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_multsntrup761.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_multsntrup761.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_multsntrup761_ntt.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_multsntrup761_ntt.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_scale3sntrup761.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_scale3sntrup761.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_weightsntrup761.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_weightsntrup761.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_wforcesntrup761.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_core_wforcesntrup761.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_761x1531.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_761x1531.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_761x3.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_761x3.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_761x4591.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_761x4591.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_761xint16.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_761xint16.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_761xint32.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_761xint32.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_int16.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_decode_int16.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761x1531.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761x1531.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761x1531round.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761x1531round.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761x3.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761x3.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761x4591.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761x4591.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761xfreeze3.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761xfreeze3.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761xint16.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_761xint16.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_int16.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_encode_int16.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_sort_int32.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_sort_int32.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_sort_uint32.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_sort_uint32.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_stream_aes256ctr.c create mode 100644 
crypto_kem/sntrup761/avx2/crypto_stream_aes256ctr.h create mode 100644 crypto_kem/sntrup761/avx2/crypto_verify_1039.c create mode 100644 crypto_kem/sntrup761/avx2/crypto_verify_1039.h create mode 100644 crypto_kem/sntrup761/avx2/kem.c create mode 100644 crypto_kem/sntrup761/avx2/params.h create mode 100644 crypto_kem/sntrup761/clean/LICENSE create mode 100644 crypto_kem/sntrup761/clean/Makefile create mode 100644 crypto_kem/sntrup761/clean/Makefile.Microsoft_nmake create mode 100644 crypto_kem/sntrup761/clean/api.h create mode 100644 crypto_kem/sntrup761/clean/crypto_core_inv3sntrup761.c create mode 100644 crypto_kem/sntrup761/clean/crypto_core_inv3sntrup761.h create mode 100644 crypto_kem/sntrup761/clean/crypto_core_invsntrup761.c create mode 100644 crypto_kem/sntrup761/clean/crypto_core_invsntrup761.h create mode 100644 crypto_kem/sntrup761/clean/crypto_core_mult3sntrup761.c create mode 100644 crypto_kem/sntrup761/clean/crypto_core_mult3sntrup761.h create mode 100644 crypto_kem/sntrup761/clean/crypto_core_multsntrup761.c create mode 100644 crypto_kem/sntrup761/clean/crypto_core_multsntrup761.h create mode 100644 crypto_kem/sntrup761/clean/crypto_core_scale3sntrup761.c create mode 100644 crypto_kem/sntrup761/clean/crypto_core_scale3sntrup761.h create mode 100644 crypto_kem/sntrup761/clean/crypto_core_weightsntrup761.c create mode 100644 crypto_kem/sntrup761/clean/crypto_core_weightsntrup761.h create mode 100644 crypto_kem/sntrup761/clean/crypto_core_wforcesntrup761.c create mode 100644 crypto_kem/sntrup761/clean/crypto_core_wforcesntrup761.h create mode 100644 crypto_kem/sntrup761/clean/crypto_decode_761x1531.c create mode 100644 crypto_kem/sntrup761/clean/crypto_decode_761x1531.h create mode 100644 crypto_kem/sntrup761/clean/crypto_decode_761x3.c create mode 100644 crypto_kem/sntrup761/clean/crypto_decode_761x3.h create mode 100644 crypto_kem/sntrup761/clean/crypto_decode_761x4591.c create mode 100644 crypto_kem/sntrup761/clean/crypto_decode_761x4591.h create mode 100644 crypto_kem/sntrup761/clean/crypto_decode_761xint16.c create mode 100644 crypto_kem/sntrup761/clean/crypto_decode_761xint16.h create mode 100644 crypto_kem/sntrup761/clean/crypto_decode_761xint32.c create mode 100644 crypto_kem/sntrup761/clean/crypto_decode_761xint32.h create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761x1531.c create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761x1531.h create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761x1531round.c create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761x1531round.h create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761x3.c create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761x3.h create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761x4591.c create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761x4591.h create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761xfreeze3.c create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761xfreeze3.h create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761xint16.c create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_761xint16.h create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_int16.c create mode 100644 crypto_kem/sntrup761/clean/crypto_encode_int16.h create mode 100644 crypto_kem/sntrup761/clean/crypto_sort_int32.c create mode 100644 crypto_kem/sntrup761/clean/crypto_sort_int32.h create mode 100644 crypto_kem/sntrup761/clean/crypto_sort_uint32.c create mode 100644 crypto_kem/sntrup761/clean/crypto_sort_uint32.h create mode 
100644 crypto_kem/sntrup761/clean/crypto_stream_aes256ctr.c create mode 100644 crypto_kem/sntrup761/clean/crypto_stream_aes256ctr.h create mode 100644 crypto_kem/sntrup761/clean/crypto_verify_1039.c create mode 100644 crypto_kem/sntrup761/clean/crypto_verify_1039.h create mode 100644 crypto_kem/sntrup761/clean/kem.c create mode 100644 crypto_kem/sntrup761/clean/params.h create mode 100644 crypto_kem/sntrup857/META.yml create mode 100644 crypto_kem/sntrup857/avx2/LICENSE create mode 100644 crypto_kem/sntrup857/avx2/Makefile create mode 100644 crypto_kem/sntrup857/avx2/api.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_inv3sntrup857.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_inv3sntrup857.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_invsntrup857.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_invsntrup857.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_mult3sntrup857.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_mult3sntrup857.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_multsntrup857.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_multsntrup857.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_multsntrup857_ntt.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_multsntrup857_ntt.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_scale3sntrup857.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_scale3sntrup857.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_weightsntrup857.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_weightsntrup857.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_wforcesntrup857.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_core_wforcesntrup857.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_857x1723.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_857x1723.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_857x3.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_857x3.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_857x5167.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_857x5167.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_857xint16.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_857xint16.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_857xint32.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_857xint32.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_int16.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_decode_int16.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857x1723.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857x1723.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857x1723round.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857x1723round.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857x3.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857x3.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857x5167.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857x5167.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857xfreeze3.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857xfreeze3.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857xint16.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_857xint16.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_encode_int16.c create mode 100644 
crypto_kem/sntrup857/avx2/crypto_encode_int16.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_sort_int32.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_sort_int32.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_sort_uint32.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_sort_uint32.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_stream_aes256ctr.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_stream_aes256ctr.h create mode 100644 crypto_kem/sntrup857/avx2/crypto_verify_1184.c create mode 100644 crypto_kem/sntrup857/avx2/crypto_verify_1184.h create mode 100644 crypto_kem/sntrup857/avx2/kem.c create mode 100644 crypto_kem/sntrup857/avx2/params.h create mode 100644 crypto_kem/sntrup857/clean/LICENSE create mode 100644 crypto_kem/sntrup857/clean/Makefile create mode 100644 crypto_kem/sntrup857/clean/Makefile.Microsoft_nmake create mode 100644 crypto_kem/sntrup857/clean/api.h create mode 100644 crypto_kem/sntrup857/clean/crypto_core_inv3sntrup857.c create mode 100644 crypto_kem/sntrup857/clean/crypto_core_inv3sntrup857.h create mode 100644 crypto_kem/sntrup857/clean/crypto_core_invsntrup857.c create mode 100644 crypto_kem/sntrup857/clean/crypto_core_invsntrup857.h create mode 100644 crypto_kem/sntrup857/clean/crypto_core_mult3sntrup857.c create mode 100644 crypto_kem/sntrup857/clean/crypto_core_mult3sntrup857.h create mode 100644 crypto_kem/sntrup857/clean/crypto_core_multsntrup857.c create mode 100644 crypto_kem/sntrup857/clean/crypto_core_multsntrup857.h create mode 100644 crypto_kem/sntrup857/clean/crypto_core_scale3sntrup857.c create mode 100644 crypto_kem/sntrup857/clean/crypto_core_scale3sntrup857.h create mode 100644 crypto_kem/sntrup857/clean/crypto_core_weightsntrup857.c create mode 100644 crypto_kem/sntrup857/clean/crypto_core_weightsntrup857.h create mode 100644 crypto_kem/sntrup857/clean/crypto_core_wforcesntrup857.c create mode 100644 crypto_kem/sntrup857/clean/crypto_core_wforcesntrup857.h create mode 100644 crypto_kem/sntrup857/clean/crypto_decode_857x1723.c create mode 100644 crypto_kem/sntrup857/clean/crypto_decode_857x1723.h create mode 100644 crypto_kem/sntrup857/clean/crypto_decode_857x3.c create mode 100644 crypto_kem/sntrup857/clean/crypto_decode_857x3.h create mode 100644 crypto_kem/sntrup857/clean/crypto_decode_857x5167.c create mode 100644 crypto_kem/sntrup857/clean/crypto_decode_857x5167.h create mode 100644 crypto_kem/sntrup857/clean/crypto_decode_857xint16.c create mode 100644 crypto_kem/sntrup857/clean/crypto_decode_857xint16.h create mode 100644 crypto_kem/sntrup857/clean/crypto_decode_857xint32.c create mode 100644 crypto_kem/sntrup857/clean/crypto_decode_857xint32.h create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857x1723.c create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857x1723.h create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857x1723round.c create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857x1723round.h create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857x3.c create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857x3.h create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857x5167.c create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857x5167.h create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857xfreeze3.c create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857xfreeze3.h create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857xint16.c create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_857xint16.h create mode 
100644 crypto_kem/sntrup857/clean/crypto_encode_int16.c
 create mode 100644 crypto_kem/sntrup857/clean/crypto_encode_int16.h
 create mode 100644 crypto_kem/sntrup857/clean/crypto_sort_int32.c
 create mode 100644 crypto_kem/sntrup857/clean/crypto_sort_int32.h
 create mode 100644 crypto_kem/sntrup857/clean/crypto_sort_uint32.c
 create mode 100644 crypto_kem/sntrup857/clean/crypto_sort_uint32.h
 create mode 100644 crypto_kem/sntrup857/clean/crypto_stream_aes256ctr.c
 create mode 100644 crypto_kem/sntrup857/clean/crypto_stream_aes256ctr.h
 create mode 100644 crypto_kem/sntrup857/clean/crypto_verify_1184.c
 create mode 100644 crypto_kem/sntrup857/clean/crypto_verify_1184.h
 create mode 100644 crypto_kem/sntrup857/clean/kem.c
 create mode 100644 crypto_kem/sntrup857/clean/params.h
 create mode 100644 test/duplicate_consistency/ntrulpr653_avx2.yml
 create mode 100644 test/duplicate_consistency/ntrulpr653_clean.yml
 create mode 100644 test/duplicate_consistency/ntrulpr761_avx2.yml
 create mode 100644 test/duplicate_consistency/ntrulpr761_clean.yml
 create mode 100644 test/duplicate_consistency/ntrulpr857_avx2.yml
 create mode 100644 test/duplicate_consistency/ntrulpr857_clean.yml
 create mode 100644 test/duplicate_consistency/sntrup653_avx2.yml
 create mode 100644 test/duplicate_consistency/sntrup653_clean.yml
 create mode 100644 test/duplicate_consistency/sntrup761_avx2.yml
 create mode 100644 test/duplicate_consistency/sntrup761_clean.yml
 create mode 100644 test/duplicate_consistency/sntrup857_avx2.yml
 create mode 100644 test/duplicate_consistency/sntrup857_clean.yml

diff --git a/crypto_kem/ntrulpr653/META.yml b/crypto_kem/ntrulpr653/META.yml
new file mode 100644
index 00000000..78ca966f
--- /dev/null
+++ b/crypto_kem/ntrulpr653/META.yml
@@ -0,0 +1,26 @@
+name: ntrulpr653
+type: kem
+claimed-nist-level: 2
+claimed-security: IND-CCA2
+length-public-key: 897
+length-secret-key: 1125
+length-ciphertext: 1025
+length-shared-secret: 32
+nistkat-sha256: 6f8be58bb5d9785a0693fa8d34f5d89193757e1244e26f6182372c3e6de84fb2
+principal-submitters:
+  - Daniel J. Bernstein
+  - Chitchanok Chuengsatiansup
+  - Tanja Lange
+  - Christine van Vredendaal
+implementations:
+  - name: clean
+    version: supercop-20200826
+  - name: avx2
+    version: supercop-20200826
+    supported_platforms:
+      - architecture: x86_64
+        operating_systems:
+          - Linux
+          - Darwin
+        required_flags:
+          - avx2
diff --git a/crypto_kem/ntrulpr653/avx2/LICENSE b/crypto_kem/ntrulpr653/avx2/LICENSE
new file mode 100644
index 00000000..d5d21fff
--- /dev/null
+++ b/crypto_kem/ntrulpr653/avx2/LICENSE
@@ -0,0 +1 @@
+Public Domain
diff --git a/crypto_kem/ntrulpr653/avx2/Makefile b/crypto_kem/ntrulpr653/avx2/Makefile
new file mode 100644
index 00000000..5d22a4ce
--- /dev/null
+++ b/crypto_kem/ntrulpr653/avx2/Makefile
@@ -0,0 +1,22 @@
+# This Makefile can be used with GNU Make or BSD Make
+
+LIB=libntrulpr653_avx2.a
+HEADERS=api.h crypto_core_multsntrup653.h crypto_core_multsntrup653_ntt.h crypto_decode_256x16.h crypto_decode_256x2.h crypto_decode_653x1541.h crypto_decode_653x3.h crypto_decode_653xint16.h crypto_decode_653xint32.h crypto_encode_256x16.h crypto_encode_256x2.h crypto_encode_653x1541.h crypto_encode_653x1541round.h crypto_encode_653x3.h crypto_encode_653xint16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_1025.h params.h
+OBJECTS=crypto_core_multsntrup653.o crypto_core_multsntrup653_ntt.o crypto_decode_256x16.o crypto_decode_256x2.o crypto_decode_653x1541.o crypto_decode_653x3.o crypto_decode_653xint16.o crypto_decode_653xint32.o crypto_encode_256x16.o crypto_encode_256x2.o crypto_encode_653x1541.o crypto_encode_653x1541round.o crypto_encode_653x3.o crypto_encode_653xint16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1025.o kem.o
+
+CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
+
+all: $(LIB)
+
+%.o: %.s $(HEADERS)
+	$(AS) -o $@ $<
+
+%.o: %.c $(HEADERS)
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+$(LIB): $(OBJECTS)
+	$(AR) -r $@ $(OBJECTS)
+
+clean:
+	$(RM) $(OBJECTS)
+	$(RM) $(LIB)
diff --git a/crypto_kem/ntrulpr653/avx2/api.h b/crypto_kem/ntrulpr653/avx2/api.h
new file mode 100644
index 00000000..d1f32496
--- /dev/null
+++ b/crypto_kem/ntrulpr653/avx2/api.h
@@ -0,0 +1,16 @@
+#ifndef PQCLEAN_NTRULPR653_AVX2_API_H
+#define PQCLEAN_NTRULPR653_AVX2_API_H
+
+
+
+#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ALGNAME "ntrulpr653"
+
+#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_SECRETKEYBYTES 1125
+#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_PUBLICKEYBYTES 897
+#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_CIPHERTEXTBYTES 1025
+#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_BYTES 32
+
+int PQCLEAN_NTRULPR653_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
+int PQCLEAN_NTRULPR653_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk);
+int PQCLEAN_NTRULPR653_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk);
+#endif
diff --git a/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653.c b/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653.c
new file mode 100644
index 00000000..7381e5ca
--- /dev/null
+++ b/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653.c
@@ -0,0 +1,314 @@
+#include "crypto_core_multsntrup653.h"
+#include "crypto_core_multsntrup653_ntt.h"
+#include "crypto_decode_653xint16.h"
+#include "crypto_encode_653xint16.h"
+#include <immintrin.h>
+
+typedef int8_t int8;
+typedef int16_t int16;
+
+#define int16x16 __m256i
+#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p))
+#define
store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) +#define const_x16 _mm256_set1_epi16 +#define add_x16 _mm256_add_epi16 +#define sub_x16 _mm256_sub_epi16 +#define mullo_x16 _mm256_mullo_epi16 +#define mulhi_x16 _mm256_mulhi_epi16 +#define mulhrs_x16 _mm256_mulhrs_epi16 +#define signmask_x16(x) _mm256_srai_epi16((x),15) + +typedef union { + int16 v[3][512]; + int16x16 _dummy; +} vec3x512; + +typedef union { + int16 v[768]; + int16x16 _dummy; +} vec768; + +typedef union { + int16 v[3 * 512]; + int16x16 _dummy; +} vec1536; + +static inline int16x16 squeeze_4621_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(7)), const_x16(4621))); +} + +static inline int16x16 squeeze_7681_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); +} + +static inline int16x16 squeeze_10753_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(3)), const_x16(10753))); +} + +static inline int16x16 mulmod_4621_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-29499)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(4621)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(7681)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_10753_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-10751)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(10753)); + return sub_x16(b, e); +} + +#define mask0 _mm256_set_epi16(-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1) +#define mask1 _mm256_set_epi16(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0) +#define mask2 _mm256_set_epi16(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0) + +static void good(int16 fpad[3][512], const int16 f[768]) { + int j; + int16x16 f0, f1; + + j = 0; + for (;;) { + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[1][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[2][j], (f0 & mask2) | (f1 & mask0)); + j += 16; + if (j == 256) { + break; + } + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[1][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[2][j], (f0 & mask1) | (f1 & mask2)); + j += 16; + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[1][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[2][j], (f0 & mask0) | (f1 & mask1)); + j += 16; + } + for (;;) { + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask2); + store_x16(&fpad[1][j], f0 & mask0); + store_x16(&fpad[2][j], f0 & mask1); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask1); + store_x16(&fpad[1][j], f0 & mask2); + store_x16(&fpad[2][j], f0 & mask0); + j += 16; + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask0); + store_x16(&fpad[1][j], f0 & mask1); + store_x16(&fpad[2][j], f0 & mask2); + j += 16; + } +} + +static void ungood(int16 f[1536], const int16 fpad[3][512]) { + int j; + int16x16 f0, f1, f2, g0, g1, g2; + + j = 0; + + for (;;) { + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + 
f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g1 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask2)|(f1&mask0)|(f2&mask1) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g1 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask1)|(f1&mask2)|(f2&mask0) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g1 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask0)|(f1&mask1)|(f2&mask2) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + } +} + +static void mult768(int16 h[1536], const int16 f[768], const int16 g[768]) { + vec3x512 x1, x2; + vec1536 x3, x4; +#define fpad (x1.v) +#define gpad (x2.v) +#define hpad fpad +#define h_7681 (x3.v) +#define h_10753 (x4.v) + int i; + + good(fpad, f); + PQCLEAN_NTRULPR653_AVX2_ntt512_7681(fpad[0], 3); + + good(gpad, g); + PQCLEAN_NTRULPR653_AVX2_ntt512_7681(gpad[0], 3); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_7681_x16(f0, g0); + int16x16 d1 = mulmod_7681_x16(f1, g1); + int16x16 d2 = mulmod_7681_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_7681_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_7681_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_7681_x16(h0)); + store_x16(&hpad[1][i], squeeze_7681_x16(h1)); + store_x16(&hpad[2][i], squeeze_7681_x16(h2)); + } + + PQCLEAN_NTRULPR653_AVX2_invntt512_7681(hpad[0], 3); + ungood(h_7681, (const int16(*)[512]) hpad); + + good(fpad, f); + PQCLEAN_NTRULPR653_AVX2_ntt512_10753(fpad[0], 3); + + good(gpad, g); + PQCLEAN_NTRULPR653_AVX2_ntt512_10753(gpad[0], 3); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_10753_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_10753_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_10753_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_10753_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_10753_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_10753_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_10753_x16(f0, g0); + int16x16 d1 = mulmod_10753_x16(f1, g1); + int16x16 d2 = mulmod_10753_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_10753_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_10753_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_10753_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_10753_x16(h0)); + store_x16(&hpad[1][i], 
squeeze_10753_x16(h1));
+        store_x16(&hpad[2][i], squeeze_10753_x16(h2));
+    }
+
+    PQCLEAN_NTRULPR653_AVX2_invntt512_10753(hpad[0], 3);
+    ungood(h_10753, (const int16(*)[512]) hpad);
+
+    for (i = 0; i < 1536; i += 16) {
+        int16x16 u1 = load_x16(&h_10753[i]);
+        int16x16 u2 = load_x16(&h_7681[i]);
+        int16x16 t;
+        u1 = mulmod_10753_x16(u1, const_x16(1268));
+        u2 = mulmod_7681_x16(u2, const_x16(956));
+        t = mulmod_7681_x16(sub_x16(u2, u1), const_x16(-2539));
+        t = add_x16(u1, mulmod_4621_x16(t, const_x16(1487)));
+        store_x16(&h[i], t);
+    }
+}
+
+#define crypto_decode_pxint16 PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16
+#define crypto_encode_pxint16 PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16
+
+#define p 653
+#define q 4621
+
+static inline int16x16 freeze_4621_x16(int16x16 x) {
+    int16x16 mask, xq;
+    x = add_x16(x, const_x16(q)&signmask_x16(x));
+    mask = signmask_x16(sub_x16(x, const_x16((q + 1) / 2)));
+    xq = sub_x16(x, const_x16(q));
+    x = _mm256_blendv_epi8(xq, x, mask);
+    return x;
+}
+
+int PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) {
+    vec768 x1, x2;
+    vec1536 x3;
+#define f (x1.v)
+#define g (x2.v)
+#define fg (x3.v)
+#define h f
+    int i;
+    int16x16 x;
+
+    x = const_x16(0);
+    for (i = p & ~15; i < 768; i += 16) {
+        store_x16(&f[i], x);
+    }
+    for (i = p & ~15; i < 768; i += 16) {
+        store_x16(&g[i], x);
+    }
+
+    crypto_decode_pxint16(f, inbytes);
+
+    for (i = 0; i < 768; i += 16) {
+        x = load_x16(&f[i]);
+        x = freeze_4621_x16(squeeze_4621_x16(x));
+        store_x16(&f[i], x);
+    }
+    for (i = 0; i < p; ++i) {
+        int8 gi = kbytes[i];
+        int8 gi0 = gi & 1;
+        g[i] = gi0 - (gi & (gi0 << 1));
+    }
+
+    mult768(fg, f, g);
+
+    fg[0] -= fg[p - 1];
+    for (i = 0; i < 768; i += 16) {
+        int16x16 fgi = load_x16(&fg[i]);
+        int16x16 fgip = load_x16(&fg[i + p]);
+        int16x16 fgip1 = load_x16(&fg[i + p - 1]);
+        x = add_x16(fgi, add_x16(fgip, fgip1));
+        x = freeze_4621_x16(squeeze_4621_x16(x));
+        store_x16(&h[i], x);
+    }
+
+    crypto_encode_pxint16(outbytes, h);
+
+    return 0;
+}
diff --git a/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653.h b/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653.h
new file mode 100644
index 00000000..0637b00c
--- /dev/null
+++ b/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653.h
@@ -0,0 +1,11 @@
+#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_CORE_MULTSNTRUP653_H
+#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_CORE_MULTSNTRUP653_H
+
+#include <stdint.h>
+#define PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653_OUTPUTBYTES 1306
+#define PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653_INPUTBYTES 1306
+#define PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653_KEYBYTES 653
+#define PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653_CONSTBYTES 0
+
+int PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes);
+#endif
diff --git a/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653_ntt.c b/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653_ntt.c
new file mode 100644
index 00000000..461f626a
--- /dev/null
+++ b/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653_ntt.c
@@ -0,0 +1,927 @@
+#include "crypto_core_multsntrup653.h"
+#include "crypto_core_multsntrup653_ntt.h"
+#include <immintrin.h>
+#include <stdint.h>
+
+/* auto-generated; do not edit */
+
+
+typedef int8_t int8;
+typedef int16_t int16;
+
+#define zeta(n,i) (((__m256i *) zeta_##n)[(i)])
+#define zeta_x4(n,i) (((__m256i *) zeta_x4_##n)[(i)])
+#define zeta_qinv(n,i) (((__m256i *) qinvzeta_##n)[(i)])
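+/* Twiddle access: zeta(n,i) and zeta_qinv(n,i) pick up the i-th block of 16
+   precomputed roots for a size-n layer; the qinvzeta_* tables hold the same
+   roots premultiplied by q^-1 mod 2^16, as expected by mulmod_x16_scaled.
+   The zetainv* macros below read the corresponding table back-to-front via
+   _mm256_loadu_reverse16, so no separate inverse-twiddle table is stored. */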
+#define zeta_x4_qinv(n,i) (((__m256i *) qinvzeta_x4_##n)[(i)]) +#define zetainv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_x4_##n+2*(n)+4-16*((i)+1))) +#define zetainv_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_x4_##n+2*(n)+4-16*((i)+1))) + +typedef union { + int16 data[93 * 16]; + __m256i _dummy; +} vec1488; + +static const vec1488 qdata_7681 = { .data = { + +#define q_x16 (qdata[0]) + 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, + +#define qrecip_x16 (qdata[1]) + 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, + +#define qshift_x16 (qdata[2]) + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + +#define zeta4_x16 (qdata[3]) + -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, + +#define zeta4_x16_qinv (qdata[4]) + -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, + +#define zeta8_x16 (qdata[5]) + -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, + +#define zeta8_x16_qinv (qdata[6]) + -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, + +#define zetainv8_x16 (qdata[7]) + -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, + +#define zetainv8_x16_qinv (qdata[8]) + -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, + +#define zeta_x4_16 (qdata+9) + -3593, -3593, -3593, -3593, -2194, -2194, -2194, -2194, -3625, -3625, -3625, -3625, 1100, 1100, 1100, 1100, + -3777, -3777, -3777, -3777, -2456, -2456, -2456, -2456, 3182, 3182, 3182, 3182, 3696, 3696, 3696, 3696, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_16 (qdata+12) + -9, -9, -9, -9, 4974, 4974, 4974, 4974, -16425, -16425, -16425, -16425, 7244, 7244, 7244, 7244, + -28865, -28865, -28865, -28865, -14744, -14744, -14744, -14744, 10350, 10350, 10350, 10350, -4496, -4496, -4496, -4496, + 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_x4_32 (qdata+15) + -3593, -3593, -3593, -3593, 1414, 1414, 1414, 1414, -2194, -2194, -2194, -2194, -2495, -2495, -2495, -2495, + -3625, -3625, -3625, -3625, 2876, 2876, 2876, 2876, 1100, 1100, 1100, 1100, -2250, -2250, -2250, -2250, + -3777, -3777, -3777, -3777, -1701, -1701, -1701, -1701, -2456, -2456, -2456, -2456, 834, 834, 834, 834, + 3182, 3182, 3182, 3182, -2319, -2319, -2319, -2319, 3696, 3696, 3696, 3696, 121, 121, 121, 121, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_32 (qdata+20) + -9, -9, -9, -9, 20870, 20870, 20870, 20870, 4974, 4974, 4974, 4974, 22593, 22593, 22593, 22593, + -16425, -16425, -16425, -16425, 828, 828, 828, 828, 7244, 7244, 7244, 7244, -23754, -23754, -23754, -23754, + -28865, -28865, -28865, -28865, 20315, 20315, 20315, 20315, -14744, -14744, -14744, -14744, 18242, 18242, 18242, 18242, + 10350, 10350, 10350, 10350, -18191, -18191, -18191, -18191, -4496, -4496, -4496, -4496, -11655, -11655, -11655, -11655, + 9, 9, 
9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_64 (qdata+25) + -3593, -617, 1414, 3706, -2194, -1296, -2495, -2237, -3625, 2830, 2876, -1599, 1100, 1525, -2250, 2816, + -3777, 1921, -1701, 2006, -2456, 1483, 834, -1986, 3182, 3364, -2319, -1993, 3696, -2557, 121, 2088, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_64 (qdata+28) + -9, 19351, 20870, -15750, 4974, -9488, 22593, 7491, -16425, 26382, 828, 23489, 7244, 20469, -23754, 2816, + -28865, -5759, 20315, -3114, -14744, 15307, 18242, -19394, 10350, -10972, -18191, -31177, -4496, -25597, -11655, 22568, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_128 (qdata+31) + -3593, -2804, -617, -396, 1414, -549, 3706, 810, -2194, -1321, -1296, 438, -2495, -2535, -2237, -3689, + -3625, 2043, 2830, -1881, 2876, 3153, -1599, 7, 1100, -514, 1525, -1760, -2250, -2440, 2816, 3600, + -3777, 103, 1921, -3174, -1701, 1535, 2006, -1887, -2456, 1399, 1483, -679, 834, 3772, -1986, 1738, + 3182, -1431, 3364, -3555, -2319, -2310, -1993, 638, 3696, -2956, -2557, -1305, 121, 2555, 2088, -3266, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_128 (qdata+36) + -9, -29428, 19351, 26228, 20870, 21467, -15750, 5930, 4974, -14121, -9488, -21066, 22593, 2073, 7491, 16279, + -16425, -25093, 26382, 26279, 828, -29103, 23489, 11783, 7244, 14846, 20469, 14624, -23754, -6536, 2816, 11792, + -28865, -4505, -5759, -6246, 20315, 9215, -3114, 6817, -14744, 4983, 15307, -28839, 18242, 1724, -19394, 23242, + 10350, -21399, -10972, -29667, -18191, -21766, -31177, 15998, -4496, 23668, -25597, -5913, -11655, -24581, 22568, -20674, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_256 (qdata+41) + -3593, 2665, -2804, -2572, -617, 727, -396, 3417, 1414, 2579, -549, 373, 3706, 3750, 810, -1054, + -2194, -2133, -1321, 1681, -1296, -1386, 438, -2732, -2495, 1919, -2535, -2391, -2237, 2835, -3689, 2, + -3625, -783, 2043, 3145, 2830, 1533, -1881, 2789, 2876, 2649, 3153, 3692, -1599, -1390, 7, -1166, + 1100, 3310, -514, 2224, 1525, -2743, -1760, 2385, -2250, -486, -2440, -1756, 2816, -3816, 3600, -3831, + -3777, -1799, 103, 1497, 1921, 1521, -3174, -194, -1701, -859, 1535, 2175, 2006, -2762, -1887, -1698, + -2456, -3480, 1399, 2883, 1483, -3428, -679, -2113, 834, 1532, 3772, -660, -1986, -2764, 1738, -915, + 3182, 1056, -1431, 1350, 3364, 1464, -3555, 2919, -2319, -2160, -2310, 730, -1993, -1598, 638, 3456, + 3696, -1168, -2956, -3588, -2557, -921, -1305, 3405, 121, -404, 2555, -3135, 2088, 2233, -3266, -2426, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_256 (qdata+50) + -9, -17303, -29428, 24052, 19351, -12073, 26228, -24743, 20870, -12269, 21467, 19317, -15750, -25946, 5930, 32738, + 4974, -4693, -14121, 2193, -9488, 26262, -21066, 7508, 22593, 9599, 2073, 10409, 7491, -12013, 16279, -15358, + -16425, -16655, -25093, 32329, 26382, 24573, 26279, 13541, 828, -25511, -29103, 26220, 23489, -8558, 11783, -24718, + 7244, 10478, 14846, 26800, 20469, 26441, 14624, -29871, -23754, -3558, -6536, -16092, 2816, 8472, 11792, -7415, + -28865, -13575, -4505, -26663, -5759, -14351, -6246, -17602, 20315, -22875, 9215, 9855, -3114, -24266, 6817, -2722, + -14744, -15768, 4983, 12611, 15307, -21860, -28839, -27201, 18242, 32252, 1724, 21868, -19394, -8908, 23242, 13933, + 10350, 17440, -21399, -11962, -10972, 30136, -29667, -1689, -18191, 6032, -21766, 30426, -31177, 15810, 15998, 3456, + -4496, -9360, 23668, 27132, -25597, -5529, -5913, 1869, -11655, 22124, -24581, 21953, 22568, 23225, 
-20674, 17030, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_512 (qdata+59) + -3593, 2005, 2665, 2891, -2804, 2345, -2572, 1121, -617, -188, 727, 2786, -396, -3208, 3417, -17, + 1414, -3752, 2579, 2815, -549, 1837, 373, 151, 3706, -1012, 3750, -1509, 810, -3214, -1054, 3177, + -2194, -1403, -2133, -3314, -1321, 83, 1681, -658, -1296, 2070, -1386, -3547, 438, 3781, -2732, 2230, + -2495, -1669, 1919, 2589, -2535, -3312, -2391, -3542, -2237, -1441, 2835, -3568, -3689, -402, 2, -1070, + -3625, 3763, -783, -3550, 2043, -2303, 3145, -436, 2830, -893, 1533, 1712, -1881, 124, 2789, -2001, + 2876, -2460, 2649, 3770, 3153, 2965, 3692, -1203, -1599, 2874, -1390, -1407, 7, -3745, -1166, 1649, + 1100, 2937, 3310, 3461, -514, -1526, 2224, 715, 1525, -1689, -2743, 434, -1760, -3163, 2385, -929, + -2250, -2167, -486, -1144, -2440, -370, -1756, 2378, 2816, -1084, -3816, -1586, 3600, 1931, -3831, -1242, + -3777, 592, -1799, 2340, 103, -1338, 1497, -2071, 1921, 1519, 1521, 451, -3174, 589, -194, -3744, + -1701, 3677, -859, -1295, 1535, 642, 2175, -3794, 2006, 2130, -2762, 2918, -1887, 3334, -1698, 2072, + -2456, 509, -3480, 2998, 1399, -3408, 2883, 1476, 1483, -2262, -3428, -1779, -679, 2258, -2113, 1348, + 834, -692, 1532, 2247, 3772, 2083, -660, -226, -1986, 2532, -2764, -3693, 1738, -429, -915, -2059, + 3182, 2812, 1056, 3434, -1431, -2515, 1350, -236, 3364, -2386, 1464, 222, -3555, -2963, 2919, -2422, + -2319, -3657, -2160, 3450, -2310, -791, 730, 1181, -1993, -1404, -1598, 2339, 638, -3366, 3456, 2161, + 3696, -3343, -1168, 2719, -2956, -826, -3588, -670, -2557, 777, -921, 1151, -1305, -796, 3405, -1278, + 121, -3287, -404, 1072, 2555, 293, -3135, 2767, 2088, -3335, 2233, 3581, -3266, 3723, -2426, -179, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_512 (qdata+76) + -9, 4565, -17303, 16715, -29428, 15145, 24052, -22943, 19351, 1860, -12073, -28958, 26228, -7304, -24743, -529, + 20870, -24232, -12269, 10495, 21467, -16083, 19317, 20119, -15750, -27636, -25946, -12261, 5930, -26766, 32738, -16791, + 4974, 25733, -4693, 20238, -14121, 18003, 2193, 6510, -9488, 29718, 26262, -25563, -21066, -1851, 7508, -19274, + 22593, -28805, 9599, -23523, 2073, 4880, 10409, 1578, 7491, -10145, -12013, 4624, 16279, 6766, -15358, 24530, + -16425, 5299, -16655, -2526, -25093, -9983, 32329, 5708, 26382, -23933, 24573, 26288, 26279, 30844, 13541, 30255, + 828, 15972, -25511, 17082, -29103, -27243, 26220, -2739, 23489, 16186, -8558, -9087, 11783, -12449, -24718, -14223, + 7244, -8839, 10478, 30597, 14846, -12790, 26800, 14539, 20469, -6297, 26441, 9650, 14624, -25179, -29871, -9633, + -23754, -5751, -3558, 2952, -6536, 23182, -16092, 23882, 2816, 964, 8472, -10802, 11792, -17013, -7415, -30938, + -28865, -23984, -13575, -11996, -4505, -14650, -26663, -22039, -5759, 1007, -14351, 10179, -6246, -947, -17602, -20128, + 20315, 10333, -22875, -17167, 9215, -14718, 9855, -29394, -3114, 27730, -24266, 5990, 6817, 22790, -2722, 14360, + -14744, 23549, -15768, -18506, 4983, 21168, 12611, 3524, 15307, 2858, -21860, 29453, -28839, 27858, -27201, 3396, + 18242, 5452, 32252, -18745, 1724, -4573, 21868, 31518, -19394, 20964, -8908, -18541, 23242, 17491, 13933, 16885, + 10350, -32004, 17440, -24214, -21399, -20435, -11962, -22764, -10972, -27986, 30136, -802, -29667, 11885, -1689, -13686, + -18191, 32695, 6032, -16006, -21766, -20759, 30426, -24931, -31177, -32124, 15810, -4317, 15998, 26330, 3456, -13711, + -4496, -19215, -9360, 26783, 23668, -14138, 27132, -32414, -25597, -2807, -5529, 8831, 
-5913, 17636, 1869, -16638, + -11655, 9513, 22124, 25648, -24581, -21723, 21953, -14129, 22568, -15111, 23225, 26621, -20674, -15221, 17030, -1715, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static const vec1488 qdata_10753 = { .data = { + + 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, + + 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, + + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + + 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, + + 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, + + 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, + + -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, + + 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, + + -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, + + 1018, 1018, 1018, 1018, 2413, 2413, 2413, 2413, 4188, 4188, 4188, 4188, 357, 357, 357, 357, + 223, 223, 223, 223, -3686, -3686, -3686, -3686, -3688, -3688, -3688, -3688, -376, -376, -376, -376, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 10093, 10093, 10093, 10093, -1956, -1956, -1956, -1956, 28517, 28517, 28517, 28517, + 27359, 27359, 27359, 27359, -21094, -21094, -21094, -21094, 408, 408, 408, 408, -20856, -20856, -20856, -20856, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 1018, 1018, 1018, -2695, -2695, -2695, -2695, 2413, 2413, 2413, 2413, 425, 425, 425, 425, + 4188, 4188, 4188, 4188, -4855, -4855, -4855, -4855, 357, 357, 357, 357, -3364, -3364, -3364, -3364, + 223, 223, 223, 223, 730, 730, 730, 730, -3686, -3686, -3686, -3686, -4544, -4544, -4544, -4544, + -3688, -3688, -3688, -3688, -2236, -2236, -2236, -2236, -376, -376, -376, -376, 3784, 3784, 3784, 3784, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 7033, 7033, 7033, 7033, 10093, 10093, 10093, 10093, 18345, 18345, 18345, 18345, + -1956, -1956, -1956, -1956, 29449, 29449, 29449, 29449, 28517, 28517, 28517, 28517, -9508, -9508, -9508, -9508, + 27359, 27359, 27359, 27359, 16090, 16090, 16090, 16090, -21094, -21094, -21094, -21094, 28224, 28224, 28224, 28224, + 408, 408, 408, 408, -12476, -12476, -12476, -12476, -20856, -20856, -20856, -20856, 16072, 16072, 16072, 16072, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -1520, -2695, 1341, 2413, 918, 425, 5175, 4188, -4035, -4855, 341, 357, 4347, -3364, 5213, + 223, -4875, 730, 1931, -3686, -2503, -4544, -4095, -3688, 5063, -2236, -3823, -376, 3012, 3784, -2629, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 23056, 7033, 829, 10093, 26518, 18345, 3639, -1956, -4547, 29449, 3925, 28517, -7429, -9508, -11683, + 27359, -17675, 16090, 14731, -21094, -25543, 28224, -14847, 408, 28103, -12476, 10001, -20856, -7228, 16072, 18363, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -2935, -1520, -4744, -2695, -205, 1341, 1299, 2413, 4, 918, -4379, 425, -4616, 5175, -544, + 4188, 4129, -4035, 4102, -4855, -1287, 341, -2388, 357, 1284, 4347, 2984, -3364, 2178, 5213, -2576, + 223, 2790, -4875, 4876, 730, -4513, 1931, -3085, -3686, 3550, -2503, 847, -4544, 193, -4095, 1085, + -3688, 3091, 5063, -4742, -2236, 2982, -3823, -1009, 
-376, -268, 3012, 3062, 3784, -2565, -2629, 4189, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 31369, 23056, 15736, 7033, -24269, 829, -6381, 10093, 22532, 26518, 23781, 18345, 15864, 3639, 15840, + -1956, -23007, -4547, 5126, 29449, 8441, 3925, -16724, 28517, 23812, -7429, 31656, -9508, -19326, -11683, -27152, + 27359, 20198, -17675, 6924, 16090, 22623, 14731, 5619, -21094, -24098, -25543, 3407, 28224, 22209, -14847, 573, + 408, -4589, 28103, -5766, -12476, -12378, 10001, -31217, -20856, -2316, -7228, -20490, 16072, -14341, 18363, -12707, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -4734, -2935, -400, -1520, 4977, -4744, -2973, -2695, 512, -205, -779, 1341, -1356, 1299, 635, + 2413, 567, 4, -4286, 918, -5114, -4379, -1586, 425, 1615, -4616, -336, 5175, -1841, -544, 2234, + 4188, -3441, 4129, 636, -4035, -4580, 4102, 2684, -4855, 3057, -1287, -2740, 341, -5156, -2388, -472, + 357, -794, 1284, 578, 4347, 3615, 2984, -3715, -3364, 2271, 2178, -326, 5213, 454, -2576, -3337, + 223, 2998, 2790, -151, -4875, 2981, 4876, 1324, 730, 2774, -4513, 2206, 1931, 886, -3085, -970, + -3686, 3198, 3550, 2737, -2503, -909, 847, 1068, -4544, -2213, 193, 2884, -4095, -4808, 1085, 4123, + -3688, 5341, 3091, 5294, 5063, -116, -4742, -5116, -2236, -2045, 2982, -1572, -3823, 4828, -1009, 467, + -376, 5023, -268, -3169, 3012, -1458, 3062, -1268, 3784, -675, -2565, 1006, -2629, 5064, 4189, 864, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -26238, 31369, -24976, 23056, -30351, 15736, -18845, 7033, 512, -24269, -13579, 829, 29364, -6381, -11141, + 10093, -969, 22532, 6978, 26518, -4090, 23781, 11726, 18345, 4175, 15864, 7856, 3639, 719, 15840, -31558, + -1956, 31887, -23007, -21892, -4547, 22044, 5126, -19844, 29449, -32271, 8441, 32076, 3925, -11300, -16724, 28200, + 28517, 16614, 23812, 11842, -7429, -2017, 31656, 28541, -9508, 29407, -19326, 31418, -11683, -31290, -27152, 27895, + 27359, 12214, 20198, -14999, -17675, -1627, 6924, -13012, 16090, -4394, 22623, 7326, 14731, -22666, 5619, 8246, + -21094, 24702, -24098, 177, -25543, 7795, 3407, -13268, 28224, 2395, 22209, -7356, -14847, -17096, 573, -24037, + 408, -11555, -4589, -30546, 28103, 1932, -5766, 17412, -12476, 31235, -12378, -7716, 10001, -1316, -31217, 25555, + -20856, -609, -2316, -8801, -7228, 11854, -20490, 780, 16072, -17571, -14341, -2066, 18363, 17352, -12707, 17248, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 3453, -4734, 4519, -2935, 2118, -400, -554, -1520, 2196, 4977, 1893, -4744, -1409, -2973, -1053, + -2695, 4601, 512, 279, -205, -3241, -779, 4889, 1341, 3524, -1356, -1663, 1299, 2283, 635, 73, + 2413, 2428, 567, 624, 4, -1930, -4286, 3419, 918, -2062, -5114, 5068, -4379, -97, -1586, 1782, + 425, 4621, 1615, 355, -4616, 1349, -336, 825, 5175, 3135, -1841, 1160, -544, 4408, 2234, -2605, + 4188, 854, -3441, -1056, 4129, 2439, 636, 4967, -4035, -4782, -4580, -5268, 4102, -663, 2684, -4670, + -4855, 3760, 3057, 3535, -1287, 2680, -2740, -569, 341, 2139, -5156, 3827, -2388, 1639, -472, 1927, + 357, 5172, -794, -4003, 1284, 4144, 578, 693, 4347, 4784, 3615, 3125, 2984, 1122, -3715, 2113, + -3364, -573, 2271, -4328, 2178, 2909, -326, -4000, 5213, -4447, 454, -3995, -2576, -4428, -3337, 2529, + 223, 5309, 2998, 5120, 2790, -2050, -151, 2963, -4875, 2657, 2981, -2807, 4876, 2237, 1324, -4403, + 730, 2624, 2774, -5083, -4513, 40, 2206, 152, 1931, -1573, 886, 2625, -3085, -778, -970, -5107, + -3686, 4250, 3198, -5356, 3550, -3148, 2737, -3360, -2503, -2015, -909, 3096, 847, 5313, 1068, 834, 
+ -4544, -1132, -2213, -2151, 193, -1722, 2884, -4393, -4095, 2662, -4808, -2788, 1085, -1992, 4123, 5334, + -3688, 5215, 5341, -1689, 3091, -2117, 5294, 4859, 5063, 3410, -116, 2205, -4742, -2374, -5116, -4720, + -2236, 3570, -2045, 2813, 2982, 2087, -1572, -4973, -3823, 458, 4828, 3891, -1009, -2419, 467, -4891, + -376, -1381, 5023, 1204, -268, 274, -3169, -3260, 3012, -1635, -1458, 4540, 3062, -4254, -1268, -1111, + 3784, 2230, -675, -2279, -2565, -4359, 1006, -1510, -2629, 5015, 5064, -2449, 4189, -5005, 864, 2487, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -29827, -26238, -21593, 31369, -29626, -24976, -7722, 23056, -16236, -30351, 30053, 15736, 9343, -18845, -16925, + 7033, 14329, 512, 15127, -24269, -21161, -13579, -1767, 829, -6716, 29364, -12415, -6381, 31467, -11141, 1609, + 10093, -20100, -969, -23952, 22532, -25482, 6978, 8027, 26518, 17394, -4090, -25652, 23781, -5729, 11726, -21770, + 18345, -4083, 4175, -15517, 15864, -19643, 7856, -22215, 3639, -18881, 719, -19320, 15840, -7880, -31558, 22483, + -1956, -6314, 31887, 15328, -23007, -7289, -21892, 11623, -4547, 31058, 22044, 13164, 5126, -15511, -19844, 6594, + 29449, 11952, -32271, 6095, 8441, 23160, 32076, 22471, 3925, 6747, -11300, 12531, -16724, 8295, 28200, -7801, + 28517, -29644, 16614, -20899, 23812, 12336, 11842, 20661, -7429, 12976, -2017, 23093, 31656, -3998, 28541, 24129, + -9508, -61, 29407, -232, -19326, -13987, 31418, 12384, -11683, -31583, -31290, 24165, -27152, 26292, 27895, 8161, + 27359, 4797, 12214, 5120, 20198, 19454, -14999, -4717, -17675, 8289, -1627, 31497, 6924, 1725, -13012, 19661, + 16090, -30144, -4394, -9691, 22623, 28712, 7326, 4248, 14731, 3035, -22666, 24641, 5619, -24330, 8246, -13811, + -21094, -13158, 24702, -23788, -24098, 27572, 177, 13024, -25543, -29151, 7795, 7192, 3407, 27329, -13268, 12098, + 28224, -19564, 2395, -8807, 22209, 32070, -7356, -22313, -14847, 20070, -17096, 23836, 573, -14280, -24037, -1834, + 408, 32351, -11555, 4967, -4589, 18875, -30546, -6917, 28103, -26286, 1932, 18077, -5766, 29370, 17412, 19856, + -12476, 23026, 31235, -30467, -12378, -24025, -7716, -12653, 10001, -8758, -1316, -20173, -31217, -11123, 25555, 23269, + -20856, -29541, -609, 31924, -2316, 3346, -8801, -13500, -7228, 14237, 11854, 14780, -20490, -9374, 780, 16809, + 16072, 11446, -17571, -8935, -14341, 5369, -2066, -18918, 18363, 19863, 17352, -16273, -12707, 3699, 17248, 951, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static inline __m256i sub_x16(__m256i a, __m256i b) { + //__asm__("vpsubw %1,%0,%0" : "+x"(a),"+x"(b)); + return _mm256_sub_epi16(a, b); +} + +static inline __m256i add_x16(__m256i a, __m256i b) { + return _mm256_add_epi16(a, b); +} + +static inline __m256i reduce_x16(const __m256i *qdata, __m256i x) { + __m256i y = _mm256_mulhi_epi16(x, qrecip_x16); + y = _mm256_mulhrs_epi16(y, qshift_x16); + y = _mm256_mullo_epi16(y, q_x16); + return sub_x16(x, y); +} + +static inline __m256i mulmod_x16_scaled(const __m256i *qdata, __m256i x, __m256i y, __m256i yqinv) { + __m256i b = _mm256_mulhi_epi16(x, y); + __m256i d = _mm256_mullo_epi16(x, yqinv); + __m256i e = _mm256_mulhi_epi16(d, q_x16); + return sub_x16(b, e); +} + +typedef union { + int8 data[32]; + __m256i _dummy; +} byte32; +static const byte32 shuffle_buf = { .data = { + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + } +}; +#define shuffle (*(__m256i *) shuffle_buf.data) + +static inline __m256i _mm256_loadu_reverse16(const __m256i *p) { + 
__m256i x = _mm256_loadu_si256(p); + x = _mm256_permute2x128_si256(x, x, 1); + x = _mm256_shuffle_epi8(x, shuffle); + return x; +} + +static void ntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + for (rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 = _mm256_loadu_si256((__m256i *) (f + 96)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_0, zetainv_qinv_128_0); + f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 0), zeta_qinv(128, 0)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 0), zeta_qinv(64, 0)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = _mm256_loadu_si256((__m256i *) (f + 80)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_1, zetainv_qinv_128_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 1), zeta_qinv(128, 1)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 1), zeta_qinv(64, 1)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + + f += 128; + } + f = origf; + for 
(rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 0), zeta_x4_qinv(16, 0)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 32)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 1), zeta_x4_qinv(16, 1)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); + _mm256_storeu_si256((__m256i *) (f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + + f1 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 32)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f2 = add_x16(g2, g3); + f3 = sub_x16(g2, g3); + f2 = reduce_x16(qdata, f2); + f3 = reduce_x16(qdata, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f0 = reduce_x16(qdata, f0); + + h0 = f0; + h1 = f1; + h2 = f2; + h3 = f3; + + f1 = _mm256_loadu_si256((__m256i *) (f + 80)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = sub_x16(f0, f2); + g0 
= add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv8_x16, zetainv8_x16_qinv); + f2 = mulmod_x16_scaled(qdata, f2, zeta8_x16, zeta8_x16_qinv); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta4_x16, zeta4_x16_qinv); + f0 = reduce_x16(qdata, f0); + + g0 = add_x16(h0, f0); + g1 = add_x16(h1, f1); + g2 = add_x16(h2, f2); + g3 = add_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 0), g0); + _mm256_storeu_si256((__m256i *) (f + 16), g1); + _mm256_storeu_si256((__m256i *) (f + 32), g2); + _mm256_storeu_si256((__m256i *) (f + 48), g3); + g0 = sub_x16(h0, f0); + g1 = sub_x16(h1, f1); + g2 = sub_x16(h2, f2); + g3 = sub_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 64), g0); + _mm256_storeu_si256((__m256i *) (f + 80), g1); + _mm256_storeu_si256((__m256i *) (f + 96), g2); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + f += 128; + } +} + +static void ntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + int16 *origf = f; + int rep; + __m256i zetainv_512[8]; + __m256i zetainv_qinv_512[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_512[i], zetainv_qinv_512[i]); + f2 = mulmod_x16_scaled(qdata, f2, zeta(512, i), zeta_qinv(512, i)); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(256, i), zeta_qinv(256, i)); + f0 = reduce_x16(qdata, f0); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i), f0); + + } + f += 512; + } + f = origf; + ntt128(f, reps * 4, qdata); +} + +void PQCLEAN_NTRULPR653_AVX2_ntt512_7681(int16 *f, int reps) { + ntt512(f, reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_NTRULPR653_AVX2_ntt512_10753(int16 *f, int reps) { + ntt512(f, reps, (const __m256i *) qdata_10753.data); +} + +static void invntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_x4_16_0 = zetainv_x4(16, 0); + __m256i zetainv_x4_qinv_16_0 = zetainv_x4_qinv(16, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_64_0 = zetainv(64, 0); + __m256i zetainv_qinv_64_0 = zetainv_qinv(64, 0); + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i zetainv_x4_16_1 = zetainv_x4(16, 1); + __m256i zetainv_x4_qinv_16_1 = zetainv_x4_qinv(16, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + __m256i zetainv_64_1 = zetainv(64, 1); + __m256i zetainv_qinv_64_1 = zetainv_qinv(64, 
1); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g0 = _mm256_loadu_si256((__m256i *) (f + 32)); + g1 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + h1 = sub_x16(f0, f1); + h1 = reduce_x16(qdata, h1); + h0 = add_x16(f0, f1); + h3 = sub_x16(f2, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h2 = add_x16(f2, f3); + f1 = sub_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zetainv8_x16, zetainv8_x16_qinv); + f0 = add_x16(g0, g1); + f3 = sub_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zeta8_x16, zeta8_x16_qinv); + f2 = add_x16(g2, g3); + + g0 = add_x16(h0, h2); + g0 = reduce_x16(qdata, g0); + g2 = sub_x16(h0, h2); + g2 = reduce_x16(qdata, g2); + g1 = sub_x16(h1, h3); + g3 = add_x16(h1, h3); + h2 = sub_x16(f0, f2); + h2 = mulmod_x16_scaled(qdata, h2, zeta4_x16, zeta4_x16_qinv); + h0 = add_x16(f0, f2); + h3 = add_x16(f1, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h1 = sub_x16(f1, f3); + + f0 = add_x16(g0, h0); + g0 = sub_x16(g0, h0); + f1 = add_x16(g1, h1); + g1 = sub_x16(g1, h1); + f2 = sub_x16(g2, h2); + g2 = add_x16(g2, h2); + f3 = sub_x16(g3, h3); + g3 = add_x16(g3, h3); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), g0); + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 96), g1); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + _mm256_storeu_si256((__m256i *) (f + 48), g2); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = _mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 = _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_0, zetainv_x4_qinv_16_0); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 32)); + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = 
_mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 = _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_1, zetainv_x4_qinv_16_1); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); + _mm256_storeu_si256((__m256i *) (f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 = _mm256_loadu_si256((__m256i *) (f + 96)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = _mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_0, zetainv_qinv_128_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 0), zeta_qinv(128, 0)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_0, zetainv_qinv_64_0); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = _mm256_loadu_si256((__m256i *) (f + 80)); + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = _mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_1, zetainv_qinv_128_1); + f3 = mulmod_x16_scaled(qdata, f3, 
zeta(128, 1), zeta_qinv(128, 1)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_1, zetainv_qinv_64_1); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + + f += 128; + } +} + +static void invntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + /* [-Werror=unused-variable] */ /* int16 *origf = f; */ + int rep; + __m256i zetainv_512[8]; + __m256i zetainv_qinv_512[8]; + __m256i zetainv_256[8]; + __m256i zetainv_qinv_256[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_256[i] = zetainv(256, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_256[i] = zetainv_qinv(256, i); + } + invntt128(f, 4 * reps, qdata); + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_512[i], zetainv_qinv_512[i]); + f3 = mulmod_x16_scaled(qdata, f3, zeta(512, i), zeta_qinv(512, i)); + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_256[i], zetainv_qinv_256[i]); + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + } + f += 512; + } +} + +void PQCLEAN_NTRULPR653_AVX2_invntt512_7681(int16 *f, int reps) { + invntt512(f, reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_NTRULPR653_AVX2_invntt512_10753(int16 *f, int reps) { + invntt512(f, reps, (const __m256i *) qdata_10753.data); +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653_ntt.h b/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653_ntt.h new file mode 100644 index 00000000..2d04b742 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_core_multsntrup653_ntt.h @@ -0,0 +1,13 @@ +#ifndef ntt_H +#define ntt_H + +#include + + + +extern void PQCLEAN_NTRULPR653_AVX2_ntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_NTRULPR653_AVX2_ntt512_10753(int16_t *f, int reps); +extern void PQCLEAN_NTRULPR653_AVX2_invntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_NTRULPR653_AVX2_invntt512_10753(int16_t *f, int reps); + +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_256x16.c b/crypto_kem/ntrulpr653/avx2/crypto_decode_256x16.c new file mode 100644 index 00000000..cbb0080f --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_256x16.c @@ -0,0 +1,11 @@ +#include "crypto_decode_256x16.h" + + +void 
PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16(void *v, const unsigned char *s) { + unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + T[2 * i] = s[i] & 15; + T[2 * i + 1] = s[i] >> 4; + } +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_256x16.h b/crypto_kem/ntrulpr653/avx2/crypto_decode_256x16.h new file mode 100644 index 00000000..449637a0 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_256X16_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_256X16_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_256x2.c b/crypto_kem/ntrulpr653/avx2/crypto_decode_256x2.c new file mode 100644 index 00000000..506266a5 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_256x2.c @@ -0,0 +1,27 @@ +#include "crypto_decode_256x2.h" +#include +#include + +#define COPY _mm256_set_epi64x(0x0303030303030303,0x0202020202020202,0x0101010101010101,0x0000000000000000) +#define MASK _mm256_set1_epi64x(0x8040201008040201) +#define MASK2 _mm256_set1_epi64x(0x0101010101010101) + +void PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2(void *v, const unsigned char *s) { + __m256i *r = v; + int i; + + for (i = 0; i < 8; ++i) { + /* bytes s0 s1 s2 s3 */ + __m256i x = _mm256_set1_epi32(*(int32_t *) s); + /* s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 */ + x = _mm256_shuffle_epi8(x, COPY); + /* s0 s0 s0 s0 s0 s0 s0 s0 s1 s1 s1 s1 s1 s1 s1 s1 s2 s2 s2 s2 s2 s2 s2 s2 s3 s3 s3 s3 s3 s3 s3 s3 */ + x = _mm256_andnot_si256(x, MASK); + x = _mm256_cmpeq_epi8(x, _mm256_setzero_si256()); + x &= MASK2; + _mm256_storeu_si256(r, x); + + s += 4; + r += 1; + } +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_256x2.h b/crypto_kem/ntrulpr653/avx2/crypto_decode_256x2.h new file mode 100644 index 00000000..f4b515da --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_256X2_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_256X2_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_653x1541.c b/crypto_kem/ntrulpr653/avx2/crypto_decode_653x1541.c new file mode 100644 index 00000000..21ae3335 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_653x1541.c @@ -0,0 +1,408 @@ +#include "crypto_decode_653x1541.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define int32 int32_t + +static inline int16 mullo(int16 x, int16 y) { + return x * y; +} + +static inline int16 mulhi(int16 x, int16 y) { + return (x * (int32)y) >> 16; +} + +static inline __m256i add(__m256i x, __m256i y) { + return _mm256_add_epi16(x, y); +} + +static inline __m256i sub(__m256i x, __m256i y) { + return _mm256_sub_epi16(x, y); +} + +static inline __m256i shiftleftconst(__m256i x, int16 y) { + return _mm256_slli_epi16(x, y); +} + +static inline __m256i signedshiftrightconst(__m256i x, int16 y) { + return 
_mm256_srai_epi16(x, y); +} + +static inline __m256i addconst(__m256i x, int16 y) { + return add(x, _mm256_set1_epi16(y)); +} + +static inline __m256i subconst(__m256i x, int16 y) { + return sub(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulloconst(__m256i x, int16 y) { + return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulhiconst(__m256i x, int16 y) { + return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i ifgesubconst(__m256i x, int16 y) { + __m256i y16 = _mm256_set1_epi16(y); + __m256i top16 = _mm256_set1_epi16((int16)(y - 1)); + return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); +} + +static inline __m256i ifnegaddconst(__m256i x, int16 y) { + return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); +} + +void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541(void *v, const unsigned char *s) { + int16 *R0 = v; + int16 R1[327], R2[164], R3[82], R4[41], R5[21], R6[11], R7[6], R8[3], R9[2], R10[1]; + long long i; + int16 a0, a1, a2; + __m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; + + s += PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541_STRBYTES; + a1 = 0; + a1 += *--s; /* 0...255 */ + a1 = mulhi(a1, -48) - mulhi(mullo(a1, -6433), 2608); + a1 += *--s; /* -1304...1558 */ + a1 += (a1 >> 15) & 2608; /* 0...2607 */ + R10[0] = a1; + + /* R10 ------> R9: reconstruct mod 1*[71]+[9402] */ + + i = 0; + s -= 1; + a2 = a0 = R10[0]; + a0 = mulhi(a0, -13) - mulhi(mullo(a0, 25845), 71); /* -39...35 */ + a0 += s[1 * i + 0]; /* -39...290 */ + a0 = mulhi(a0, 3) - mulhi(mullo(a0, -923), 71); /* -36...35 */ + a0 += (a0 >> 15) & 71; /* 0...70 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -22153); + + /* invalid inputs might need reduction mod 9402 */ + a1 -= 9402; + a1 += (a1 >> 15) & 9402; + + R9[0] = a0; + R9[1] = a1; + s -= 0; + + /* R9 ------> R8: reconstruct mod 2*[134]+[9402] */ + + R8[2] = R9[1]; + s -= 1; + for (i = 0; i >= 0; --i) { + a2 = a0 = R9[i]; + a0 = mulhi(a0, 14) - mulhi(mullo(a0, 5869), 134); /* -67...70 */ + a0 += s[1 * i + 0]; /* -67...325 */ + a0 = mulhi(a0, 10) - mulhi(mullo(a0, -489), 134); /* -68...67 */ + a0 += (a0 >> 15) & 134; /* 0...133 */ + a1 = (a2 << 7) + ((s[i] - a0) >> 1); + a1 = mullo(a1, 19563); + + /* invalid inputs might need reduction mod 134 */ + a1 -= 134; + a1 += (a1 >> 15) & 134; + + R8[2 * i] = a0; + R8[2 * i + 1] = a1; + } + + /* R8 ------> R7: reconstruct mod 5*[2953]+[815] */ + + i = 0; + s -= 1; + a2 = a0 = R8[2]; + a0 = mulhi(a0, 1223) - mulhi(mullo(a0, -5681), 2953); /* -1477...1782 */ + a0 += s[1 * i + 0]; /* -1477...2037 */ + a0 += (a0 >> 15) & 2953; /* 0...2952 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -9543); + + /* invalid inputs might need reduction mod 815 */ + a1 -= 815; + a1 += (a1 >> 15) & 815; + + R7[4] = a0; + R7[5] = a1; + s -= 4; + for (i = 1; i >= 0; --i) { + a0 = R8[i]; + a0 = mulhi(a0, 1223) - mulhi(mullo(a0, -5681), 2953); /* -1477...1782 */ + a0 += s[2 * i + 1]; /* -1477...2037 */ + a0 = mulhi(a0, 1223) - mulhi(mullo(a0, -5681), 2953); /* -1505...1514 */ + a0 += s[2 * i + 0]; /* -1505...1769 */ + a0 += (a0 >> 15) & 2953; /* 0...2952 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, -9543); + + /* invalid inputs might need reduction mod 2953 */ + a1 -= 2953; + a1 += (a1 >> 15) & 2953; + + R7[2 * i] = a0; + R7[2 * i + 1] = a1; + } + + /* R7 ------> R6: reconstruct mod 10*[13910]+[815] */ + + R6[10] = R7[5]; + s -= 10; + for (i = 4; i >= 0; --i) { + a2 = a0 = R7[i]; + a0 = mulhi(a0, 1756) - mulhi(mullo(a0, -1206), 13910); /* -6955...7394 */ + a0 
+= s[2 * i + 1]; /* -6955...7649 */ + a0 = mulhi(a0, 1756) - mulhi(mullo(a0, -1206), 13910); /* -7142...7159 */ + a0 += s[2 * i + 0]; /* -7142...7414 */ + a0 += (a0 >> 15) & 13910; /* 0...13909 */ + a1 = (a2 << 15) + (s[2 * i + 1] << 7) + ((s[2 * i] - a0) >> 1); + a1 = mullo(a1, -13437); + + /* invalid inputs might need reduction mod 13910 */ + a1 -= 13910; + a1 += (a1 >> 15) & 13910; + + R6[2 * i] = a0; + R6[2 * i + 1] = a1; + } + + /* R6 ------> R5: reconstruct mod 20*[1887]+[815] */ + + R5[20] = R6[10]; + s -= 10; + for (i = 9; i >= 0; --i) { + a2 = a0 = R6[i]; + a0 = mulhi(a0, -101) - mulhi(mullo(a0, -8891), 1887); /* -969...943 */ + a0 += s[1 * i + 0]; /* -969...1198 */ + a0 += (a0 >> 15) & 1887; /* 0...1886 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, 5279); + + /* invalid inputs might need reduction mod 1887 */ + a1 -= 1887; + a1 += (a1 >> 15) & 1887; + + R5[2 * i] = a0; + R5[2 * i + 1] = a1; + } + + /* R5 ------> R4: reconstruct mod 40*[695]+[815] */ + + R4[40] = R5[20]; + s -= 20; + i = 4; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R5[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -84), mulhiconst(mulloconst(A0, -24140), 695)); /* -369...347 */ + A0 = add(A0, S0); /* -369...602 */ + A0 = ifnegaddconst(A0, 695); /* 0...694 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, 31495); + + /* invalid inputs might need reduction mod 695 */ + A1 = ifgesubconst(A1, 695); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R4 ------> R3: reconstruct mod 81*[6745]+[7910] */ + + i = 0; + s -= 2; + a0 = R4[40]; + a0 = mulhi(a0, 2401) - mulhi(mullo(a0, -2487), 6745); /* -3373...3972 */ + a0 += s[2 * i + 1]; /* -3373...4227 */ + a0 = mulhi(a0, 2401) - mulhi(mullo(a0, -2487), 6745); /* -3497...3527 */ + a0 += s[2 * i + 0]; /* -3497...3782 */ + a0 += (a0 >> 15) & 6745; /* 0...6744 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, -29207); + + /* invalid inputs might need reduction mod 7910 */ + a1 -= 7910; + a1 += (a1 >> 15) & 7910; + + R3[80] = a0; + R3[81] = a1; + s -= 80; + i = 24; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R4[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 2401), mulhiconst(mulloconst(A0, -2487), 6745)); /* -3373...3972 */ + A0 = add(A0, S1); /* -3373...4227 */ + A0 = sub(mulhiconst(A0, 2401), mulhiconst(mulloconst(A0, -2487), 6745)); /* -3497...3527 */ + A0 = add(A0, S0); /* -3497...3782 */ + A0 = ifnegaddconst(A0, 6745); /* 0...6744 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, -29207); + + /* invalid inputs might need reduction mod 6745 */ + A1 = ifgesubconst(A1, 6745); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = 
_mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R3 ------> R2: reconstruct mod 163*[1314]+[1541] */ + + i = 0; + s -= 1; + a2 = a0 = R3[81]; + a0 = mulhi(a0, 64) - mulhi(mullo(a0, -12768), 1314); /* -657...673 */ + a0 += s[1 * i + 0]; /* -657...928 */ + a0 += (a0 >> 15) & 1314; /* 0...1313 */ + a1 = (a2 << 7) + ((s[i] - a0) >> 1); + a1 = mullo(a1, -399); + + /* invalid inputs might need reduction mod 1541 */ + a1 -= 1541; + a1 += (a1 >> 15) & 1541; + + R2[162] = a0; + R2[163] = a1; + s -= 81; + i = 65; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R3[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 64), mulhiconst(mulloconst(A0, -12768), 1314)); /* -657...673 */ + A0 = add(A0, S0); /* -657...928 */ + A0 = ifnegaddconst(A0, 1314); /* 0...1313 */ + A1 = add(shiftleftconst(A2, 7), signedshiftrightconst(sub(S0, A0), 1)); + A1 = mulloconst(A1, -399); + + /* invalid inputs might need reduction mod 1314 */ + A1 = ifgesubconst(A1, 1314); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R2 ------> R1: reconstruct mod 326*[9277]+[1541] */ + + R1[326] = R2[163]; + s -= 326; + i = 147; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R2[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 4400), mulhiconst(mulloconst(A0, -1808), 9277)); /* -4639...5738 */ + A0 = add(A0, S1); /* -4639...5993 */ + A0 = sub(mulhiconst(A0, 4400), mulhiconst(mulloconst(A0, -1808), 9277)); /* -4950...5040 */ + A0 = add(A0, S0); /* -4950...5295 */ + A0 = ifnegaddconst(A0, 9277); /* 0...9276 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, -27883); + + /* invalid inputs might need reduction mod 9277 */ + A1 = ifgesubconst(A1, 9277); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); + 
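[Editor's note, not part of the patch] For orientation while reading the reconstruction loops in this decoder: each step undoes one encoding level, where the encoder had combined two values r0, r1 < m into r2 = r0 + m*r1 and written out the low byte(s) of r2. A minimal plain-C sketch of that split is below (split_mod is a hypothetical helper name); the AVX2 code around this point computes the same remainder and quotient without division, using the precomputed mulhi/mullo constants and a multiply by a modular-inverse constant, so it stays branch-free across 16 lanes.

#include <stdint.h>

/* Hypothetical reference for one reconstruction step: the encoder packed
 * r0, r1 (each in [0, m)) as r2 = r0 + m*r1 and emitted low bytes of r2;
 * decoding recovers them as a remainder/quotient pair. */
static void split_mod(uint32_t r2, uint32_t m, uint16_t *r0, uint16_t *r1) {
    *r0 = (uint16_t)(r2 % m); /* the vector code gets this from mulhi/mullo reductions */
    *r1 = (uint16_t)(r2 / m); /* ...and this from a multiply by m^-1 mod 2^16 */
}

For example, the scalar constants -22153 and -9543 in the steps above are exactly 71^-1 and 2953^-1 modulo 2^16, which is what turns the subtract-remainder-then-multiply pattern into an exact quotient.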
_mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R1 ------> R0: reconstruct mod 653*[1541] */ + + R0[652] = 3 * R1[326] - 2310; + s -= 326; + i = 310; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R1[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 349), mulhiconst(mulloconst(A0, -10887), 1541)); /* -771...857 */ + A0 = add(A0, S0); /* -771...1112 */ + A0 = ifnegaddconst(A0, 1541); /* 0...1540 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, -10547); + + /* invalid inputs might need reduction mod 1541 */ + A1 = ifgesubconst(A1, 1541); + + A0 = mulloconst(A0, 3); + A1 = mulloconst(A1, 3); + A0 = subconst(A0, 2310); + A1 = subconst(A1, 2310); + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_653x1541.h b/crypto_kem/ntrulpr653/avx2/crypto_decode_653x1541.h new file mode 100644 index 00000000..8bdcdcfa --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_653x1541.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653X1541_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653X1541_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541_STRBYTES 865 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541_ITEMS 653 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541_ITEMBYTES 2 + +void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_653x3.c b/crypto_kem/ntrulpr653/avx2/crypto_decode_653x3.c new file mode 100644 index 00000000..de701e22 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_653x3.c @@ -0,0 +1,65 @@ +#include "crypto_decode_653x3.h" +#include +#define uint8 uint8_t + +#define p 653 +#define loops 6 +#define overshoot 29 + +void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3(void *v, const unsigned char *s) { + uint8 *f = v; + int loop; + uint8 *nextf = f + 128 - 4 * overshoot; + const unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i s0 = _mm256_loadu_si256((const __m256i *) s); + s = nexts; + nexts += 32; + + __m256i s1 = _mm256_srli_epi16(s0 & _mm256_set1_epi8(-16), 4); + s0 &= _mm256_set1_epi8(15); + + __m256i a0 = _mm256_unpacklo_epi8(s0, s1); + /* 0 0>>4 1 1>>4 2 2>>4 3 3>>4 4 4>>4 5 5>>4 6 6>>4 7 7>>4 */ + /* 16 16>>4 ... */ + __m256i a1 = _mm256_unpackhi_epi8(s0, s1); + /* 8 8>>4 9 9>>4 10 10>>4 ... */ + /* 24 24>>4 ... */ + + __m256i a2 = _mm256_srli_epi16(a0 & _mm256_set1_epi8(12), 2); + __m256i a3 = _mm256_srli_epi16(a1 & _mm256_set1_epi8(12), 2); + a0 &= _mm256_set1_epi8(3); + a1 &= _mm256_set1_epi8(3); + + __m256i b0 = _mm256_unpacklo_epi8(a0, a2); + /* 0 0>>2 0>>4 0>>6 1 1>>2 1>>4 1>>6 */ + /* 2 2>>2 2>>4 2>>6 3 3>>2 3>>4 3>.6 */ + /* 16 16>>2 16>>4 16>>6 ... 
*/ + __m256i b2 = _mm256_unpackhi_epi8(a0, a2); + /* 4 4>>2 ... */ + __m256i b1 = _mm256_unpacklo_epi8(a1, a3); + /* 8 8>>2 ... */ + __m256i b3 = _mm256_unpackhi_epi8(a1, a3); + /* 12 12>>2 ... */ + + __m256i f0 = _mm256_permute2x128_si256(b0, b2, 0x20); + __m256i f2 = _mm256_permute2x128_si256(b0, b2, 0x31); + __m256i f1 = _mm256_permute2x128_si256(b1, b3, 0x20); + __m256i f3 = _mm256_permute2x128_si256(b1, b3, 0x31); + + f0 = _mm256_add_epi8(f0, _mm256_set1_epi8(-1)); + f1 = _mm256_add_epi8(f1, _mm256_set1_epi8(-1)); + f2 = _mm256_add_epi8(f2, _mm256_set1_epi8(-1)); + f3 = _mm256_add_epi8(f3, _mm256_set1_epi8(-1)); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + f = nextf; + nextf += 128; + } + + *f = ((uint8)(*s & 3)) - 1; +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_653x3.h b/crypto_kem/ntrulpr653/avx2/crypto_decode_653x3.h new file mode 100644 index 00000000..fa74a621 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_653x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653X3_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653X3_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3_STRBYTES 164 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3_ITEMS 653 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint16.c b/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint16.c new file mode 100644 index 00000000..fc5d7fd2 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint16.c @@ -0,0 +1,16 @@ +#include "crypto_decode_653xint16.h" + + +void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint16.h b/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint16.h new file mode 100644 index 00000000..af3e318f --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653XINT16_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653XINT16_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16_STRBYTES 1306 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16_ITEMS 653 + +void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint32.c b/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint32.c new file mode 100644 index 00000000..3559b848 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_653xint32.h" + + +void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint32.h b/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint32.h new file mode 100644 
index 00000000..06f57027 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_decode_653xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653XINT32_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653XINT32_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32_STRBYTES 2612 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32_ITEMBYTES 4 +#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32_ITEMS 653 + +void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_256x16.c b/crypto_kem/ntrulpr653/avx2/crypto_encode_256x16.c new file mode 100644 index 00000000..b17a2188 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_256x16.c @@ -0,0 +1,10 @@ +#include "crypto_encode_256x16.h" + + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16(unsigned char *s, const void *v) { + const unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + s[i] = T[2 * i] + (T[2 * i + 1] << 4); + } +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_256x16.h b/crypto_kem/ntrulpr653/avx2/crypto_encode_256x16.h new file mode 100644 index 00000000..c7f79e39 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_256X16_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_256X16_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_256x2.c b/crypto_kem/ntrulpr653/avx2/crypto_encode_256x2.c new file mode 100644 index 00000000..058fc8f4 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_256x2.c @@ -0,0 +1,88 @@ +#include "crypto_encode_256x2.h" +#include +#include + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2(unsigned char *s, const void *v) { + __m256i a0 = _mm256_loadu_si256(0 + (__m256i *) v); + __m256i a1 = _mm256_loadu_si256(1 + (__m256i *) v); + __m256i a2 = _mm256_loadu_si256(2 + (__m256i *) v); + __m256i a3 = _mm256_loadu_si256(3 + (__m256i *) v); + __m256i a4 = _mm256_loadu_si256(4 + (__m256i *) v); + __m256i a5 = _mm256_loadu_si256(5 + (__m256i *) v); + __m256i a6 = _mm256_loadu_si256(6 + (__m256i *) v); + __m256i a7 = _mm256_loadu_si256(7 + (__m256i *) v); + __m256i bottom = _mm256_set1_epi8(1); + __m256i zero = _mm256_setzero_si256(); + __m256i b0 = _mm256_cmpgt_epi8(a0 & bottom, zero); + __m256i b1 = _mm256_cmpgt_epi8(a1 & bottom, zero); + __m256i b2 = _mm256_cmpgt_epi8(a2 & bottom, zero); + __m256i b3 = _mm256_cmpgt_epi8(a3 & bottom, zero); + __m256i b4 = _mm256_cmpgt_epi8(a4 & bottom, zero); + __m256i b5 = _mm256_cmpgt_epi8(a5 & bottom, zero); + __m256i b6 = _mm256_cmpgt_epi8(a6 & bottom, zero); + __m256i b7 = _mm256_cmpgt_epi8(a7 & bottom, zero); + int32_t c0 = _mm256_movemask_epi8(b0); + int32_t c1 = _mm256_movemask_epi8(b1); + int32_t c2 = _mm256_movemask_epi8(b2); + int32_t c3 = _mm256_movemask_epi8(b3); + int32_t c4 = _mm256_movemask_epi8(b4); + int32_t c5 = _mm256_movemask_epi8(b5); + int32_t c6 = _mm256_movemask_epi8(b6); + int32_t c7 = _mm256_movemask_epi8(b7); + *s++ = c0; + c0 >>= 8; + *s++ = c0; + c0 >>= 8; + *s++ = c0; + c0 >>= 8; + *s++ = c0; + *s++ = c1; + c1 >>= 8; + *s++ = c1; + c1 >>= 8; + *s++ = c1; + c1 >>= 8; + *s++ = c1; + *s++ = c2; + c2 >>= 
8; + *s++ = c2; + c2 >>= 8; + *s++ = c2; + c2 >>= 8; + *s++ = c2; + *s++ = c3; + c3 >>= 8; + *s++ = c3; + c3 >>= 8; + *s++ = c3; + c3 >>= 8; + *s++ = c3; + *s++ = c4; + c4 >>= 8; + *s++ = c4; + c4 >>= 8; + *s++ = c4; + c4 >>= 8; + *s++ = c4; + *s++ = c5; + c5 >>= 8; + *s++ = c5; + c5 >>= 8; + *s++ = c5; + c5 >>= 8; + *s++ = c5; + *s++ = c6; + c6 >>= 8; + *s++ = c6; + c6 >>= 8; + *s++ = c6; + c6 >>= 8; + *s++ = c6; + *s++ = c7; + c7 >>= 8; + *s++ = c7; + c7 >>= 8; + *s++ = c7; + c7 >>= 8; + *s++ = c7; +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_256x2.h b/crypto_kem/ntrulpr653/avx2/crypto_encode_256x2.h new file mode 100644 index 00000000..12d50d31 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_256X2_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_256X2_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541.c b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541.c new file mode 100644 index 00000000..f0a66124 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541.c @@ -0,0 +1,286 @@ +#include "crypto_encode_653x1541.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[327]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 41; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 2; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2310)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1541)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[326] = (((R0[652] + 2310) & 16383) * 10923) >> 15; + + reading = (uint16 *) R; + writing = R; + i = 11; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 26; + writing -= 13; + out -= 26; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9277)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9277)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, 
x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[163] = R[326]; + + reading = (uint16 *) R; + writing = R; + i = 11; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 12; + writing -= 6; + out -= 6; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1314)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 14; + writing -= 7; + out -= 14; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6745)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6745)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(695)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = 
_mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[20] = R[40]; + + for (i = 0; i < 10; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1887; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[10] = R[20]; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)13910; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[5] = R[10]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[4]; + r1 = R[5]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + R[2] = r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)134; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)71; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541.h b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541.h new file mode 100644 index 00000000..5f498ca6 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X1541_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X1541_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541_STRBYTES 865 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541_ITEMS 653 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541_ITEMBYTES 2 + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541round.c b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541round.c new file mode 100644 index 00000000..878fe438 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541round.c @@ -0,0 +1,288 @@ +#include "crypto_encode_653x1541round.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[327]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 41; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 2; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); + x = _mm256_add_epi16(x, _mm256_add_epi16(x, x)); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2310)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1541)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); 
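[Editor's note, not part of the patch] The only difference between this round variant and plain crypto_encode_653x1541 is the first step of the loop above: each coefficient is snapped to the nearest multiple of 3 (mulhrs by 10923, roughly 2^15/3, then tripled) before the usual +2310 bias and packing. A scalar sketch of that step, matching the expression used for the last coefficient just below (round_to_3 is a hypothetical name):

#include <stdint.h>

/* Round x to the nearest multiple of 3 for the coefficient range used here;
 * (x*10923 + 16384) >> 15 is what _mm256_mulhrs_epi16(x, 10923) computes. */
static int16_t round_to_3(int16_t x) {
    int16_t q = (int16_t)((x * 10923 + 16384) >> 15); /* ~ round(x/3) */
    return (int16_t)(3 * q);
}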
+ *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[326] = (((3 * ((10923 * R0[652] + 16384) >> 15) + 2310) & 16383) * 10923) >> 15; + + reading = (uint16 *) R; + writing = R; + i = 11; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 26; + writing -= 13; + out -= 26; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9277)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9277)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[163] = R[326]; + + reading = (uint16 *) R; + writing = R; + i = 11; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 12; + writing -= 6; + out -= 6; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1314)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 14; + writing -= 7; + out -= 14; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6745)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6745)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; 
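[Editor's note, not part of the patch] All of the packing passes in these encoders, vectorized or scalar, follow one rule: combine adjacent values r0, r1 < m into r2 = r0 + m*r1, emit the low byte(s) of r2, and carry the high part into the next pass. A minimal scalar sketch of a single-byte pass is below (merge_pass is a hypothetical helper; the real passes in the scalar tails above and below vary m and emit one or two bytes per pair):

#include <stddef.h>
#include <stdint.h>

/* One limb-merging pass over n values (each < m): pairs are combined,
 * one low byte per pair is written to *out, and the n/2 high parts are
 * kept in R for the next pass. */
static size_t merge_pass(uint16_t *R, size_t n, uint32_t m, unsigned char **out) {
    size_t i;
    for (i = 0; i < n / 2; ++i) {
        uint32_t r2 = R[2 * i] + R[2 * i + 1] * m;
        *(*out)++ = (unsigned char)r2; /* low byte goes to the output stream */
        R[i] = (uint16_t)(r2 >> 8);    /* high part feeds the next pass */
    }
    return n / 2;
}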
+ writing += 16; + out += 32; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(695)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[20] = R[40]; + + for (i = 0; i < 10; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1887; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[10] = R[20]; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)13910; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[5] = R[10]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[4]; + r1 = R[5]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + R[2] = r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)134; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)71; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541round.h b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541round.h new file mode 100644 index 00000000..036eb13e --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x1541round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X1541ROUND_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X1541ROUND_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round_STRBYTES 865 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round_ITEMS 653 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round_ITEMBYTES 2 + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_653x3.c b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x3.c new file mode 100644 index 00000000..f6748a3f --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x3.c @@ -0,0 +1,64 @@ +#include "crypto_encode_653x3.h" +#include +#define uint8 uint8_t + +#define p 653 +#define loops 6 +#define overshoot 29 + +static const union { + uint8 init[32]; + __m256i val; +} lobytes_buf = { .init = { + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + } +}; +#define lobytes (lobytes_buf.val) + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3(unsigned char *s, const void *v) { + const uint8 *f = v; + int loop; + const uint8 *nextf = f + 128 - 4 * overshoot; + unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i f0 = 
_mm256_loadu_si256((const __m256i *) (f + 0)); + __m256i f1 = _mm256_loadu_si256((const __m256i *) (f + 32)); + __m256i f2 = _mm256_loadu_si256((const __m256i *) (f + 64)); + __m256i f3 = _mm256_loadu_si256((const __m256i *) (f + 96)); + f = nextf; + nextf += 128; + + __m256i a0 = _mm256_packus_epi16(f0 & lobytes, f1 & lobytes); + /* 0 2 4 6 8 10 12 14 32 34 36 38 40 42 44 46 */ + /* 16 18 20 22 24 26 28 30 48 50 52 54 56 58 60 62 */ + __m256i a1 = _mm256_packus_epi16(_mm256_srli_epi16(f0, 8), _mm256_srli_epi16(f1, 8)); + /* 1 3 ... */ + __m256i a2 = _mm256_packus_epi16(f2 & lobytes, f3 & lobytes); + __m256i a3 = _mm256_packus_epi16(_mm256_srli_epi16(f2, 8), _mm256_srli_epi16(f3, 8)); + + a0 = _mm256_add_epi8(a0, _mm256_slli_epi16(a1 & _mm256_set1_epi8(63), 2)); + a2 = _mm256_add_epi8(a2, _mm256_slli_epi16(a3 & _mm256_set1_epi8(63), 2)); + + __m256i b0 = _mm256_packus_epi16(a0 & lobytes, a2 & lobytes); + /* 0 4 8 12 32 36 40 44 64 68 72 76 96 100 104 108 */ + /* 16 20 24 28 48 52 56 60 80 84 88 92 112 116 120 124 */ + __m256i b2 = _mm256_packus_epi16(_mm256_srli_epi16(a0, 8), _mm256_srli_epi16(a2, 8)); + /* 2 6 ... */ + + b0 = _mm256_add_epi8(b0, _mm256_slli_epi16(b2 & _mm256_set1_epi8(15), 4)); + + b0 = _mm256_permutevar8x32_epi32(b0, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + + b0 = _mm256_add_epi8(b0, _mm256_set1_epi8(85)); + + _mm256_storeu_si256((__m256i *) s, b0); + s = nexts; + nexts += 32; + } + + *s++ = *f++ + 1; +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_653x3.h b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x3.h new file mode 100644 index 00000000..b4cd590d --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_653x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X3_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X3_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3_STRBYTES 164 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3_ITEMS 653 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_653xint16.c b/crypto_kem/ntrulpr653/avx2/crypto_encode_653xint16.c new file mode 100644 index 00000000..7d60b0c9 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_653xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_653xint16.h" + + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_encode_653xint16.h b/crypto_kem/ntrulpr653/avx2/crypto_encode_653xint16.h new file mode 100644 index 00000000..8d6681ad --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_encode_653xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653XINT16_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653XINT16_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16_STRBYTES 1306 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16_ITEMS 653 + +void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_sort_int32.c b/crypto_kem/ntrulpr653/avx2/crypto_sort_int32.c new file mode 100644 index 00000000..e1327c65 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_sort_int32.c @@ -0,0 +1,1210 @@ 
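[Editor's note, not part of the patch] crypto_encode_653x3 and crypto_decode_653x3 handle coefficients in {-1,0,1}: adding 1 makes each fit in two bits, so four coefficients share one byte, and the final _mm256_add_epi8(..., 85) above applies the four +1 offsets in a single step (85 = 1 + 4 + 16 + 64). A plain-C sketch of the same packing, under those assumptions (pack4/unpack4 are hypothetical names):

#include <stdint.h>

/* Pack four coefficients, each in {-1,0,1}, into one byte at two bits each. */
static uint8_t pack4(const int8_t *f) {
    return (uint8_t)((f[0] + 1) | ((f[1] + 1) << 2) | ((f[2] + 1) << 4) | ((f[3] + 1) << 6));
}

/* Inverse of pack4: recover four coefficients from one byte. */
static void unpack4(int8_t *f, uint8_t s) {
    f[0] = (int8_t)((s & 3) - 1);
    f[1] = (int8_t)(((s >> 2) & 3) - 1);
    f[2] = (int8_t)(((s >> 4) & 3) - 1);
    f[3] = (int8_t)(((s >> 6) & 3) - 1);
}

Since 653 = 4*163 + 1, the last byte carries a single coefficient, which is why both routines end with a one-element scalar step.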
+#include "crypto_sort_int32.h" +#include +// Based on supercop-20200820/crypto_sort/int32/avx2 + + +#define int32 int32_t + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min((a),(b)); \ + (b) = int32x8_max((a),(b)); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = (int32)((int64_t) * b - (int64_t) * a); + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if ((long long) n < 8) { + while ((long long) n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + 
int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = 
_mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i 
+= 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, 
d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + 
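            /*
             * Note: XOR with an all-ones mask is bitwise complement, and in
             * two's complement ~x = -1 - x, which reverses signed order.
             * The data was complemented at alternating positions earlier (see
             * the comment above this loop); after the transpose those
             * positions sit in separate registers, so complementing one group
             * here leaves the whole block either uniformly complemented
             * (flagdown: descending output) or uniformly restored (ascending
             * output) before the final merge network runs.
             */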
} else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 
* q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_NTRULPR653_AVX2_crypto_sort_int32(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + 
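        /*
         * Note: for n <= 8 the sort is an unrolled bubble sort built from the
         * branch-free int32_MINMAX compare-exchange.  This n == 8 branch
         * performs one pass of 7 adjacent exchanges (moving the maximum to
         * x[7]); the n >= 7, n >= 6, ... branches that follow run
         * progressively shorter passes, and every branch that applies for a
         * given n is executed, so the comparison pattern depends only on n,
         * never on the data.  Example with made-up values: sorting
         * {4, -7, 0, 4, -1} with n = 5 runs the n >= 5 through n >= 2 passes
         * and yields {-7, -1, 0, 4, 4}.
         */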
int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *) y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_NTRULPR653_AVX2_crypto_sort_int32(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 
0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); 
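            /*
             * Note: the _mm256_permute2x128 / unpack shuffles in this block
             * are in-register transposes.  They regroup the lanes so that the
             * elements each remaining merge stage has to compare (first at
             * distance 4, then 2, then 1) land in the same lane of two
             * registers, letting every stage keep using the full-width
             * int32x8_MINMAX instead of falling back to scalar
             * compare-exchanges.
             */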
+ int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_sort_int32.h b/crypto_kem/ntrulpr653/avx2/crypto_sort_int32.h new file mode 100644 index 00000000..c20a0167 --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_SORT +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_SORT + +#include +#include + + +void PQCLEAN_NTRULPR653_AVX2_crypto_sort_int32(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_sort_uint32.c b/crypto_kem/ntrulpr653/avx2/crypto_sort_uint32.c new file mode 
100644 index 00000000..0580b89a --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_NTRULPR653_AVX2_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_NTRULPR653_AVX2_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_sort_uint32.h b/crypto_kem/ntrulpr653/avx2/crypto_sort_uint32.h new file mode 100644 index 00000000..9fcd720b --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_sort_uint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_SORT_UINT32_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_SORT_UINT32_H + +#include +#include + + +void PQCLEAN_NTRULPR653_AVX2_crypto_sort_uint32(uint32_t *array, size_t n); + +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_stream_aes256ctr.c b/crypto_kem/ntrulpr653/avx2/crypto_stream_aes256ctr.c new file mode 100644 index 00000000..7932691b --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_stream_aes256ctr.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" + + +int PQCLEAN_NTRULPR653_AVX2_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); + return 0; +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_stream_aes256ctr.h b/crypto_kem/ntrulpr653/avx2/crypto_stream_aes256ctr.h new file mode 100644 index 00000000..d7e653ff --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_stream_aes256ctr.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_STREAM_AES256CTR_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_STREAM_AES256CTR_H +#include "aes.h" +#include +#include + + + +int PQCLEAN_NTRULPR653_AVX2_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]); + +#endif diff --git a/crypto_kem/ntrulpr653/avx2/crypto_verify_1025.c b/crypto_kem/ntrulpr653/avx2/crypto_verify_1025.c new file mode 100644 index 00000000..4286184b --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_verify_1025.c @@ -0,0 +1,36 @@ +#include "crypto_verify_1025.h" +#include + +int PQCLEAN_NTRULPR653_AVX2_crypto_verify_1025(const unsigned char *x, const unsigned char *y) { + __m256i diff = _mm256_set1_epi8(0); + unsigned int differentbits = 0; + int i = PQCLEAN_NTRULPR653_AVX2_crypto_verify_1025_BYTES; + + i -= 32; + for (;;) { + do { + __m256i x0 = _mm256_loadu_si256((__m256i *) x); + __m256i y0 = _mm256_loadu_si256((__m256i *) y); + diff |= x0 ^ y0; + i -= 32; + x += 32; + y += 32; + } while (i >= 0); + if (i <= -32) { + break; + } + x += i; + y += i; + } + + diff |= _mm256_srli_epi16(diff, 8); + diff |= _mm256_srli_epi32(diff, 16); + diff |= _mm256_srli_epi64(diff, 32); + + differentbits = _mm256_extract_epi8(diff, 0); + differentbits |= _mm256_extract_epi8(diff, 8); + differentbits |= _mm256_extract_epi8(diff, 16); + differentbits |= _mm256_extract_epi8(diff, 24); + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/crypto_kem/ntrulpr653/avx2/crypto_verify_1025.h b/crypto_kem/ntrulpr653/avx2/crypto_verify_1025.h 
new file mode 100644 index 00000000..b3a4d7bf --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/crypto_verify_1025.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_VERIFY_1025_H +#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_VERIFY_1025_H + +#include +#define PQCLEAN_NTRULPR653_AVX2_crypto_verify_1025_BYTES 1025 + +int PQCLEAN_NTRULPR653_AVX2_crypto_verify_1025(const unsigned char *x, const unsigned char *y); +#endif diff --git a/crypto_kem/ntrulpr653/avx2/kem.c b/crypto_kem/ntrulpr653/avx2/kem.c new file mode 100644 index 00000000..70d6a0ce --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/kem.c @@ -0,0 +1,287 @@ +#include "api.h" +#include "crypto_sort_uint32.h" +#include "crypto_stream_aes256ctr.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* ----- masks */ + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +#define q12 ((q-1)/2) +typedef int16 Fq; + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +/* assumes twos complement; use, e.g., gcc -fwrapv */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +/* works for all uint32 x */ +static Fq Fq_bigfreeze(uint32 x) { + x -= q * ((x * (uint64)q31) >> 31); + x -= q * ((x * (uint64)q31) >> 31); + x -= q; + x += (-(x >> 31)) & (uint32)q; + return x; +} + +/* ----- Top and Right */ + +static int8 Top(Fq C) { + return (tau1 * (int32)(C + tau0) + 16384) >> 15; +} + +static Fq Right(int8 T) { + return Fq_freeze(tau3 * (int32)T - tau2); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* ----- sorting to generate short polynomial */ + +static void Short_fromlist(small *out, const uint32 *in) { + uint32 L[ppadsort]; + int i; + + for (i = 0; i < w; ++i) { + L[i] = in[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (in[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_NTRULPR653_AVX2_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[p]; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(out, L); +} + +/* ----- Inputs, Generator */ + +typedef int8 Inputs[I]; /* passed by reference */ + +static const unsigned char aes_nonce[16] = {0}; + +/* G = Generator(pk) */ +static void Generator(Fq *G, const unsigned char *pk) { + uint32 L[p]; + int i; + + PQCLEAN_NTRULPR653_AVX2_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, 
aes_nonce, pk); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + G[i] = Fq_bigfreeze(L[i]) - q12; + } +} + +/* ----- NTRU LPRime */ + +#define Seeds_bytes 32 +#define Ciphertexts_bytes (Rounded_bytes+Top_bytes) +#define SecretKeys_bytes Small_bytes +#define PublicKeys_bytes (Seeds_bytes+Rounded_bytes) +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +static void Hide(unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + small b[p]; + int i; + + Inputs_encode(r_enc + 1, r); + { + unsigned char h[Hash_bytes]; + uint32 L[p]; + { + unsigned char s[1 + Inputs_bytes]; + Inputs_encode(s + 1, r); + s[0] = 5; + Hash(h, s, sizeof s); + } + PQCLEAN_NTRULPR653_AVX2_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, h); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(b, L); + } + { + Fq bG[p]; + Generator(bG, pk); + Rq_mult_small(bG, b); + Round_and_encode(c, bG); + c += Rounded_bytes; + } + { + Fq bA[p]; + int8 T[I]; + Rounded_decode(bA, pk + Seeds_bytes); + Rq_mult_small(bA, b); + for (i = 0; i < I; ++i) { + T[i] = Top(Fq_freeze(bA[i] + r[i] * q12)); + } + Top_encode(c, T); + c += Top_bytes; + } + { + unsigned char x[1 + Inputs_bytes + Hash_bytes]; + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] = r_enc[1 + i]; + } + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Inputs_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c, x, sizeof x); + } +} + + +int PQCLEAN_NTRULPR653_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + Fq aG[p]; + int i; + randombytes(pk, Seeds_bytes); + Generator(aG, pk); + { + small a[p]; + Short_random(a); + Rq_mult_small(aG, a); + Small_encode(sk, a); + } + Round_and_encode(pk + Seeds_bytes, aG); + { + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Inputs_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Inputs_bytes); + } + return 0; +} + +int PQCLEAN_NTRULPR653_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + int i; + unsigned char cache[Hash_bytes]; + { + unsigned char y[1 + PublicKeys_bytes]; + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + Inputs r; + { + unsigned char s[Inputs_bytes]; + randombytes(s, sizeof s); + Inputs_decode(r, s); + } + { + unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(c, x, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + return 0; +} + +int PQCLEAN_NTRULPR653_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Inputs_bytes; + Inputs r; + int i; + { + Fq aB[p]; + Rounded_decode(aB, c); + { + small a[p]; + Small_decode(a, sk); + Rq_mult_small(aB, a); + } + { + int8 T[I]; + Top_decode(T, c + Rounded_bytes); + for (i = 0; i < I; ++i) { + r[i] = -int16_negative_mask(Fq_freeze(Right(T[i]) - aB[i] + 4 * w + 1)); + } + } + } + { + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + int mask; + unsigned char x[1 + 
Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(cnew, x, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] ^= mask & (x[1 + i] ^ rho[i]); + } + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] = 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/ntrulpr653/avx2/params.h b/crypto_kem/ntrulpr653/avx2/params.h new file mode 100644 index 00000000..653fccdb --- /dev/null +++ b/crypto_kem/ntrulpr653/avx2/params.h @@ -0,0 +1,61 @@ +#ifndef params_H +#define params_H +#include "crypto_core_multsntrup653.h" +#include "crypto_decode_256x16.h" +#include "crypto_decode_256x2.h" +#include "crypto_decode_653x1541.h" +#include "crypto_decode_653x3.h" +#include "crypto_decode_653xint16.h" +#include "crypto_decode_653xint32.h" +#include "crypto_encode_256x16.h" +#include "crypto_encode_256x2.h" +#include "crypto_encode_653x1541.h" +#include "crypto_encode_653x1541round.h" +#include "crypto_encode_653x3.h" +#include "crypto_encode_653xint16.h" +#include "crypto_verify_1025.h" + + +#define p 653 +#define q 4621 +#define w 252 +#define tau0 2175 +#define tau1 113 +#define tau2 2031 +#define tau3 290 +#define I 256 + +#define ppadsort 653 + +#define q18 57 /* round(2^18/q) */ +#define q27 29045 /* round(2^27/q) */ +#define q31 464722 /* floor(2^31/q) */ + +#define crypto_verify_clen PQCLEAN_NTRULPR653_AVX2_crypto_verify_1025 + +#define Rounded_bytes PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541_STRBYTES +#define Rounded_decode PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541 + +#define Round_and_encode PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round + +#define Small_bytes PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3_STRBYTES +#define Small_encode PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3 +#define Small_decode PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3 + +#define Top_bytes PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16_STRBYTES +#define Top_encode PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16 +#define Top_decode PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16 + +#define Inputs_bytes PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2_STRBYTES +#define Inputs_encode PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2 +#define Inputs_decode PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2 + +#define crypto_decode_pxint32 PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32 + +#define crypto_decode_pxint16 PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16 + +#define crypto_encode_pxint16 PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16 + +#define crypto_core_mult PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653 + +#endif diff --git a/crypto_kem/ntrulpr653/clean/LICENSE b/crypto_kem/ntrulpr653/clean/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/ntrulpr653/clean/Makefile b/crypto_kem/ntrulpr653/clean/Makefile new file mode 100644 index 00000000..7807ed9b --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/Makefile @@ -0,0 +1,19 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libntrulpr653_clean.a +HEADERS=api.h crypto_core_multsntrup653.h crypto_decode_256x16.h crypto_decode_256x2.h crypto_decode_653x1541.h crypto_decode_653x3.h crypto_decode_653xint16.h crypto_decode_653xint32.h crypto_encode_256x16.h crypto_encode_256x2.h crypto_encode_653x1541.h crypto_encode_653x1541round.h crypto_encode_653x3.h crypto_encode_653xint16.h crypto_sort_int32.h crypto_sort_uint32.h 
crypto_stream_aes256ctr.h crypto_verify_1025.h params.h +OBJECTS=crypto_core_multsntrup653.o crypto_decode_256x16.o crypto_decode_256x2.o crypto_decode_653x1541.o crypto_decode_653x3.o crypto_decode_653xint16.o crypto_decode_653xint32.o crypto_encode_256x16.o crypto_encode_256x2.o crypto_encode_653x1541.o crypto_encode_653x1541round.o crypto_encode_653x3.o crypto_encode_653xint16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1025.o kem.o + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/ntrulpr653/clean/Makefile.Microsoft_nmake b/crypto_kem/ntrulpr653/clean/Makefile.Microsoft_nmake new file mode 100644 index 00000000..027d0730 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/Makefile.Microsoft_nmake @@ -0,0 +1,19 @@ +# This Makefile can be used with Microsoft Visual Studio's nmake using the command: +# nmake /f Makefile.Microsoft_nmake + +LIBRARY=libntrulpr653_clean.lib +OBJECTS=crypto_core_multsntrup653.obj crypto_decode_256x16.obj crypto_decode_256x2.obj crypto_decode_653x1541.obj crypto_decode_653x3.obj crypto_decode_653xint16.obj crypto_decode_653xint32.obj crypto_encode_256x16.obj crypto_encode_256x2.obj crypto_encode_653x1541.obj crypto_encode_653x1541round.obj crypto_encode_653x3.obj crypto_encode_653xint16.obj crypto_sort_int32.obj crypto_sort_uint32.obj crypto_stream_aes256ctr.obj crypto_verify_1025.obj kem.obj + +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX + +all: $(LIBRARY) + +# Make sure objects are recompiled if headers change. +$(OBJECTS): *.h + +$(LIBRARY): $(OBJECTS) + LIB.EXE /NOLOGO /WX /OUT:$@ $** + +clean: + -DEL $(OBJECTS) + -DEL $(LIBRARY) diff --git a/crypto_kem/ntrulpr653/clean/api.h b/crypto_kem/ntrulpr653/clean/api.h new file mode 100644 index 00000000..ca280411 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/api.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_API_H +#define PQCLEAN_NTRULPR653_CLEAN_API_H + + + +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ALGNAME "ntrulpr653" + +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_SECRETKEYBYTES 1125 +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_PUBLICKEYBYTES 897 +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_CIPHERTEXTBYTES 1025 +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_BYTES 32 + +int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); +int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_core_multsntrup653.c b/crypto_kem/ntrulpr653/clean/crypto_core_multsntrup653.c new file mode 100644 index 00000000..5b38329b --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_core_multsntrup653.c @@ -0,0 +1,60 @@ +#include "crypto_core_multsntrup653.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +int PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653(unsigned char *outbytes, 
const unsigned char *inbytes, const unsigned char *kbytes) { + Fq f[p]; + small g[p]; + Fq fg[p + p - 1]; + int32 result; + int i, j; + + crypto_decode_pxint16(f, inbytes); + for (i = 0; i < p; ++i) { + f[i] = Fq_freeze(f[i]); + } + + for (i = 0; i < p; ++i) { + small gi = kbytes[i]; + small gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + for (i = 0; i < p; ++i) { + result = 0; + for (j = 0; j <= i; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + for (i = p; i < p + p - 1; ++i) { + result = 0; + for (j = i - p + 1; j < p; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + + for (i = p + p - 2; i >= p; --i) { + fg[i - p] = Fq_freeze(fg[i - p] + fg[i]); + fg[i - p + 1] = Fq_freeze(fg[i - p + 1] + fg[i]); + } + + crypto_encode_pxint16(outbytes, fg); + return 0; +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_core_multsntrup653.h b/crypto_kem/ntrulpr653/clean/crypto_core_multsntrup653.h new file mode 100644 index 00000000..f653a5a9 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_core_multsntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_CORE_MULTSNTRUP653_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_CORE_MULTSNTRUP653_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653_OUTPUTBYTES 1306 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653_INPUTBYTES 1306 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653_KEYBYTES 653 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653_CONSTBYTES 0 + +int PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_256x16.c b/crypto_kem/ntrulpr653/clean/crypto_decode_256x16.c new file mode 100644 index 00000000..20460924 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_decode_256x16.c @@ -0,0 +1,11 @@ +#include "crypto_decode_256x16.h" + + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16(void *v, const unsigned char *s) { + unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + T[2 * i] = s[i] & 15; + T[2 * i + 1] = s[i] >> 4; + } +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_256x16.h b/crypto_kem/ntrulpr653/clean/crypto_decode_256x16.h new file mode 100644 index 00000000..2f5376dc --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_decode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_256X16_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_256X16_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_256x2.c b/crypto_kem/ntrulpr653/clean/crypto_decode_256x2.c new file mode 100644 index 00000000..50a0bfeb --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_decode_256x2.c @@ -0,0 +1,10 @@ +#include "crypto_decode_256x2.h" + + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2(void *v, const unsigned char *s) { + unsigned char *r = v; + int i; + for (i = 0; i < 256; ++i) { + r[i] = 1 & (s[i >> 3] >> (i & 7)); + } +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_256x2.h b/crypto_kem/ntrulpr653/clean/crypto_decode_256x2.h new file mode 100644 index 00000000..7e1f0bd8 --- /dev/null +++ 
b/crypto_kem/ntrulpr653/clean/crypto_decode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_256X2_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_256X2_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_653x1541.c b/crypto_kem/ntrulpr653/clean/crypto_decode_653x1541.c new file mode 100644 index 00000000..5e07c4bb --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_decode_653x1541.c @@ -0,0 +1,200 @@ +#include "crypto_decode_653x1541.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* +CPU division instruction typically takes time depending on x. +This software is designed to take time independent of x. +Time still varies depending on m; user must ensure that m is constant. +Time also varies on CPUs where multiplication is variable-time. +There could be more CPU issues. +There could also be compiler issues. +*/ + +static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { + uint32 v = 0x80000000; + uint32 qpart; + uint32 mask; + + v /= m; + + /* caller guarantees m > 0 */ + /* caller guarantees m < 16384 */ + /* vm <= 2^31 <= vm+m-1 */ + /* xvm <= 2^31 x <= xvm+x(m-1) */ + + *q = 0; + + qpart = (x * (uint64)v) >> 31; + /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ + /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ + /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ + /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ + + x -= qpart * m; + *q += qpart; + /* x <= 49146 */ + + qpart = (x * (uint64)v) >> 31; + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ + /* 0 <= newx <= m + 0.4 */ + /* 0 <= newx <= m */ + + x -= qpart * m; + *q += qpart; + /* x <= m */ + + x -= m; + *q += 1; + mask = -(x >> 31); + x += mask & (uint32)m; + *q += mask; + /* x < m */ + + *r = x; +} + +static uint16 uint32_mod_uint14(uint32 x, uint16 m) { + uint32 q; + uint16 r; + uint32_divmod_uint14(&q, &r, x, m); + return r; +} + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541(void *v, const unsigned char *s) { + int16 *R0 = v; + uint16 R1[327], R2[164], R3[82], R4[41], R5[21], R6[11], R7[6], R8[3], R9[2], R10[1]; + long long i; + uint16 r0; + uint32 r1, r2; + + s += PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541_STRBYTES; + r1 = 0; + r1 = (r1 << 8) | *--s; + r1 = (r1 << 8) | *--s; + r1 = uint32_mod_uint14(r1, 2608); /* needed only for invalid inputs */ + R10[0] = r1; + + r2 = R10[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 71); + R9[0] = r0; + r1 = uint32_mod_uint14(r1, 9402); /* needed only for invalid inputs */ + R9[1] = r1; + + R8[2] = R9[1]; + r2 = R9[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 134); + R8[0] = r0; + r1 = uint32_mod_uint14(r1, 134); /* needed only for invalid inputs */ + R8[1] = r1; + + r2 = R8[2]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 2953); + R7[4] = r0; + r1 = uint32_mod_uint14(r1, 815); /* needed only for invalid inputs */ + R7[5] = r1; + for (i = 1; i >= 0; --i) { + r2 = R8[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 
8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 2953); + R7[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 2953); /* needed only for invalid inputs */ + R7[2 * i + 1] = r1; + } + + R6[10] = R7[5]; + for (i = 4; i >= 0; --i) { + r2 = R7[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 13910); + R6[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 13910); /* needed only for invalid inputs */ + R6[2 * i + 1] = r1; + } + + R5[20] = R6[10]; + for (i = 9; i >= 0; --i) { + r2 = R6[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1887); + R5[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1887); /* needed only for invalid inputs */ + R5[2 * i + 1] = r1; + } + + R4[40] = R5[20]; + for (i = 19; i >= 0; --i) { + r2 = R5[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 695); + R4[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 695); /* needed only for invalid inputs */ + R4[2 * i + 1] = r1; + } + + r2 = R4[40]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6745); + R3[80] = r0; + r1 = uint32_mod_uint14(r1, 7910); /* needed only for invalid inputs */ + R3[81] = r1; + for (i = 39; i >= 0; --i) { + r2 = R4[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6745); + R3[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 6745); /* needed only for invalid inputs */ + R3[2 * i + 1] = r1; + } + + r2 = R3[81]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1314); + R2[162] = r0; + r1 = uint32_mod_uint14(r1, 1541); /* needed only for invalid inputs */ + R2[163] = r1; + for (i = 80; i >= 0; --i) { + r2 = R3[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1314); + R2[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1314); /* needed only for invalid inputs */ + R2[2 * i + 1] = r1; + } + + R1[326] = R2[163]; + for (i = 162; i >= 0; --i) { + r2 = R2[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9277); + R1[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 9277); /* needed only for invalid inputs */ + R1[2 * i + 1] = r1; + } + + R0[652] = 3 * R1[326] - 2310; + for (i = 325; i >= 0; --i) { + r2 = R1[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1541); + R0[2 * i] = 3 * r0 - 2310; + r1 = uint32_mod_uint14(r1, 1541); /* needed only for invalid inputs */ + R0[2 * i + 1] = 3 * r1 - 2310; + } +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_653x1541.h b/crypto_kem/ntrulpr653/clean/crypto_decode_653x1541.h new file mode 100644 index 00000000..99b43d6b --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_decode_653x1541.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653X1541_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653X1541_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541_STRBYTES 865 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541_ITEMS 653 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541_ITEMBYTES 2 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_653x3.c b/crypto_kem/ntrulpr653/clean/crypto_decode_653x3.c new file mode 100644 index 00000000..b703aac2 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_decode_653x3.c @@ -0,0 +1,24 @@ +#include "crypto_decode_653x3.h" + +#define uint8 uint8_t + +#define p 653 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3(void *v, const unsigned char *s) { + uint8 *f = v; + uint8 x; + int i; + + for 
(i = 0; i < p / 4; ++i) { + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + } + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_653x3.h b/crypto_kem/ntrulpr653/clean/crypto_decode_653x3.h new file mode 100644 index 00000000..8984a459 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_decode_653x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653X3_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653X3_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3_STRBYTES 164 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3_ITEMS 653 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_653xint16.c b/crypto_kem/ntrulpr653/clean/crypto_decode_653xint16.c new file mode 100644 index 00000000..e2299739 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_decode_653xint16.c @@ -0,0 +1,16 @@ +#include "crypto_decode_653xint16.h" + + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_653xint16.h b/crypto_kem/ntrulpr653/clean/crypto_decode_653xint16.h new file mode 100644 index 00000000..409c3053 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_decode_653xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653XINT16_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653XINT16_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16_STRBYTES 1306 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16_ITEMS 653 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_653xint32.c b/crypto_kem/ntrulpr653/clean/crypto_decode_653xint32.c new file mode 100644 index 00000000..29a21249 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_decode_653xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_653xint32.h" + + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_decode_653xint32.h b/crypto_kem/ntrulpr653/clean/crypto_decode_653xint32.h new file mode 100644 index 00000000..273c304c --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_decode_653xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653XINT32_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653XINT32_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32_STRBYTES 2612 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32_ITEMBYTES 4 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32_ITEMS 653 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32(void *v, const unsigned char *s); +#endif diff --git 
a/crypto_kem/ntrulpr653/clean/crypto_encode_256x16.c b/crypto_kem/ntrulpr653/clean/crypto_encode_256x16.c new file mode 100644 index 00000000..c985088c --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_256x16.c @@ -0,0 +1,10 @@ +#include "crypto_encode_256x16.h" + + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16(unsigned char *s, const void *v) { + const unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + s[i] = T[2 * i] + (T[2 * i + 1] << 4); + } +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_encode_256x16.h b/crypto_kem/ntrulpr653/clean/crypto_encode_256x16.h new file mode 100644 index 00000000..6757ca09 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_256X16_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_256X16_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_encode_256x2.c b/crypto_kem/ntrulpr653/clean/crypto_encode_256x2.c new file mode 100644 index 00000000..4ab11cd7 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_256x2.c @@ -0,0 +1,13 @@ +#include "crypto_encode_256x2.h" + + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2(unsigned char *s, const void *v) { + const unsigned char *r = v; + int i; + for (i = 0; i < 32; ++i) { + s[i] = 0; + } + for (i = 0; i < 256; ++i) { + s[i >> 3] |= (r[i] & 1) << (i & 7); + } +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_encode_256x2.h b/crypto_kem/ntrulpr653/clean/crypto_encode_256x2.h new file mode 100644 index 00000000..155bf29e --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_256X2_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_256X2_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541.c b/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541.c new file mode 100644 index 00000000..1862de4c --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541.c @@ -0,0 +1,127 @@ +#include "crypto_encode_653x1541.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[327]; + long i; + uint16 r0, r1; + uint32 r2; + + for (i = 0; i < 326; ++i) { + r0 = (((R0[2 * i] + 2310) & 16383) * 10923) >> 15; + r1 = (((R0[2 * i + 1] + 2310) & 16383) * 10923) >> 15; + r2 = r0 + r1 * (uint32)1541; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[326] = (((R0[652] + 2310) & 16383) * 10923) >> 15; + + for (i = 0; i < 163; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9277; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[163] = R[326]; + + for (i = 0; i < 82; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1314; + *out++ = r2; 
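        /* Note on this step (and the analogous loops below): r2 = r0 + M*r1
           packs two adjacent limbs with radix M (here M = 1314); the low byte
           of r2 has just been written to the output, and the remaining high
           part (r2 >> 8, stored back into R[i]) becomes one limb of the next,
           coarser level.  The later levels repeat the same pattern with
           M = 6745, 695, 1887, 13910, 2953, 134 and 71, which is why
           crypto_decode_653x1541.c undoes them with uint32_divmod_uint14
           calls using the same constants in reverse order. */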
+ r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 41; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)6745; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 20; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)695; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[20] = R[40]; + + for (i = 0; i < 10; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1887; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[10] = R[20]; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)13910; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[5] = R[10]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[4]; + r1 = R[5]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + R[2] = r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)134; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)71; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541.h b/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541.h new file mode 100644 index 00000000..fd04831c --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X1541_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X1541_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541_STRBYTES 865 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541_ITEMS 653 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541_ITEMBYTES 2 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541round.c b/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541round.c new file mode 100644 index 00000000..a4b969c4 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541round.c @@ -0,0 +1,17 @@ +#include "crypto_encode_653x1541.h" +#include "crypto_encode_653x1541round.h" + +#define int16 int16_t + +#define p 653 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round(unsigned char *out, const void *v) { + const int16 *a = v; + int16 x[p]; + int i; + + for (i = 0; i < p; ++i) { + x[i] = 3 * ((10923 * a[i] + 16384) >> 15); + } + PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541(out, x); +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541round.h b/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541round.h new file mode 100644 index 00000000..84a35183 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_653x1541round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X1541ROUND_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X1541ROUND_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round_STRBYTES 865 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round_ITEMS 653 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round_ITEMBYTES 2 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_encode_653x3.c b/crypto_kem/ntrulpr653/clean/crypto_encode_653x3.c new file mode 100644 index 
00000000..34f1f6c4 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_653x3.c @@ -0,0 +1,21 @@ +#include "crypto_encode_653x3.h" + +#define uint8 uint8_t + +#define p 653 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3(unsigned char *s, const void *v) { + const uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *f++ + 1; + x += (*f++ + 1) << 2; + x += (*f++ + 1) << 4; + x += (*f++ + 1) << 6; + *s++ = x; + } + x = *f++ + 1; + *s++ = x; +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_encode_653x3.h b/crypto_kem/ntrulpr653/clean/crypto_encode_653x3.h new file mode 100644 index 00000000..fa055522 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_653x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X3_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X3_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3_STRBYTES 164 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3_ITEMS 653 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3_ITEMBYTES 1 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_encode_653xint16.c b/crypto_kem/ntrulpr653/clean/crypto_encode_653xint16.c new file mode 100644 index 00000000..6bb71b20 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_653xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_653xint16.h" + + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_encode_653xint16.h b/crypto_kem/ntrulpr653/clean/crypto_encode_653xint16.h new file mode 100644 index 00000000..b083af75 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_encode_653xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653XINT16_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653XINT16_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16_STRBYTES 1306 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16_ITEMS 653 + +void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_sort_int32.c b/crypto_kem/ntrulpr653/clean/crypto_sort_int32.c new file mode 100644 index 00000000..ec4496d9 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_sort_int32.c @@ -0,0 +1,86 @@ +#include "crypto_sort_int32.h" +#include +// Based on supercop-20190110/crypto_sort/int32/x86 + + +#define int32 int32_t + +#define int32_MINMAX(a,b) \ + do { \ + int32_t ab = (b) ^ (a); \ + int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \ + c ^= ab & (c ^ (b)); \ + c >>= 31; \ + c &= ab; \ + (a) ^= c; \ + (b) ^= c; \ + } while(0) + +/* assume 2 <= n <= 0x40000000 */ +void PQCLEAN_NTRULPR653_CLEAN_crypto_sort_int32(int32 *array, size_t n) { + size_t top, p, q, r, i, j; + int32 *x = array; + + top = 1; + while (top < n - top) { + top += top; + } + + for (p = top; p >= 1; p >>= 1) { + i = 0; + while (i + 2 * p <= n) { + for (j = i; j < i + p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + i += 2 * p; + } + for (j = i; j < n - p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + + i = 0; + j = 0; + for (q = top; q > p; q >>= 1) { + if (j != i) { + for (;;) { + if (j == n - q) { + goto done; + } + int32 a = x[j + p]; + for (r = q; r > p; r 
>>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + if (j == i + p) { + i += 2 * p; + break; + } + } + } + while (i + p <= n - q) { + for (j = i; j < i + p; ++j) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + } + i += 2 * p; + } + /* now i + p > n - q */ + j = i; + while (j < n - q) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + } + +done: + ; + } + } +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_sort_int32.h b/crypto_kem/ntrulpr653/clean/crypto_sort_int32.h new file mode 100644 index 00000000..ebf8449f --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_SORT_INT32_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_SORT_INT32_H + +#include +#include + + +void PQCLEAN_NTRULPR653_CLEAN_crypto_sort_int32(int32_t *array, size_t n); + +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_sort_uint32.c b/crypto_kem/ntrulpr653/clean/crypto_sort_uint32.c new file mode 100644 index 00000000..35b9c3ab --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_NTRULPR653_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_NTRULPR653_CLEAN_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_sort_uint32.h b/crypto_kem/ntrulpr653/clean/crypto_sort_uint32.h new file mode 100644 index 00000000..f1a64f3f --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_sort_uint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_SORT_UINT32_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_SORT_UINT32_H + +#include +#include + + +void PQCLEAN_NTRULPR653_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n); + +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_stream_aes256ctr.c b/crypto_kem/ntrulpr653/clean/crypto_stream_aes256ctr.c new file mode 100644 index 00000000..f402a0d7 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_stream_aes256ctr.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" + + +int PQCLEAN_NTRULPR653_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); + return 0; +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_stream_aes256ctr.h b/crypto_kem/ntrulpr653/clean/crypto_stream_aes256ctr.h new file mode 100644 index 00000000..8b2991e6 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_stream_aes256ctr.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_STREAM_AES256CTR_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_STREAM_AES256CTR_H +#include "aes.h" +#include +#include + + + +int PQCLEAN_NTRULPR653_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]); + +#endif diff --git a/crypto_kem/ntrulpr653/clean/crypto_verify_1025.c b/crypto_kem/ntrulpr653/clean/crypto_verify_1025.c new file mode 100644 index 00000000..0cf6b97b 
--- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_verify_1025.c @@ -0,0 +1,13 @@ +#include "crypto_verify_1025.h" + + +int PQCLEAN_NTRULPR653_CLEAN_crypto_verify_1025(const unsigned char *x, const unsigned char *y) { + unsigned int differentbits = 0; + int i; + + for (i = 0; i < PQCLEAN_NTRULPR653_CLEAN_crypto_verify_1025_BYTES; ++i) { + differentbits |= x[i] ^ y[i]; + } + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/crypto_kem/ntrulpr653/clean/crypto_verify_1025.h b/crypto_kem/ntrulpr653/clean/crypto_verify_1025.h new file mode 100644 index 00000000..82c6c9ae --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/crypto_verify_1025.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_VERIFY_1025_H +#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_VERIFY_1025_H + +#include +#define PQCLEAN_NTRULPR653_CLEAN_crypto_verify_1025_BYTES 1025 + +int PQCLEAN_NTRULPR653_CLEAN_crypto_verify_1025(const unsigned char *x, const unsigned char *y); +#endif diff --git a/crypto_kem/ntrulpr653/clean/kem.c b/crypto_kem/ntrulpr653/clean/kem.c new file mode 100644 index 00000000..1da750e0 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/kem.c @@ -0,0 +1,287 @@ +#include "api.h" +#include "crypto_sort_uint32.h" +#include "crypto_stream_aes256ctr.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* ----- masks */ + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +#define q12 ((q-1)/2) +typedef int16 Fq; + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +/* assumes twos complement; use, e.g., gcc -fwrapv */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +/* works for all uint32 x */ +static Fq Fq_bigfreeze(uint32 x) { + x -= q * ((x * (uint64)q31) >> 31); + x -= q * ((x * (uint64)q31) >> 31); + x -= q; + x += (-(x >> 31)) & (uint32)q; + return x; +} + +/* ----- Top and Right */ + +static int8 Top(Fq C) { + return (tau1 * (int32)(C + tau0) + 16384) >> 15; +} + +static Fq Right(int8 T) { + return Fq_freeze(tau3 * (int32)T - tau2); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* ----- sorting to generate short polynomial */ + +static void Short_fromlist(small *out, const uint32 *in) { + uint32 L[ppadsort]; + int i; + + for (i = 0; i < w; ++i) { + L[i] = in[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (in[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_NTRULPR653_CLEAN_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) { + out[i] = 
h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[p]; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(out, L); +} + +/* ----- Inputs, Generator */ + +typedef int8 Inputs[I]; /* passed by reference */ + +static const unsigned char aes_nonce[16] = {0}; + +/* G = Generator(pk) */ +static void Generator(Fq *G, const unsigned char *pk) { + uint32 L[p]; + int i; + + PQCLEAN_NTRULPR653_CLEAN_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, pk); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + G[i] = Fq_bigfreeze(L[i]) - q12; + } +} + +/* ----- NTRU LPRime */ + +#define Seeds_bytes 32 +#define Ciphertexts_bytes (Rounded_bytes+Top_bytes) +#define SecretKeys_bytes Small_bytes +#define PublicKeys_bytes (Seeds_bytes+Rounded_bytes) +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +static void Hide(unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + small b[p]; + int i; + + Inputs_encode(r_enc + 1, r); + { + unsigned char h[Hash_bytes]; + uint32 L[p]; + { + unsigned char s[1 + Inputs_bytes]; + Inputs_encode(s + 1, r); + s[0] = 5; + Hash(h, s, sizeof s); + } + PQCLEAN_NTRULPR653_CLEAN_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, h); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(b, L); + } + { + Fq bG[p]; + Generator(bG, pk); + Rq_mult_small(bG, b); + Round_and_encode(c, bG); + c += Rounded_bytes; + } + { + Fq bA[p]; + int8 T[I]; + Rounded_decode(bA, pk + Seeds_bytes); + Rq_mult_small(bA, b); + for (i = 0; i < I; ++i) { + T[i] = Top(Fq_freeze(bA[i] + r[i] * q12)); + } + Top_encode(c, T); + c += Top_bytes; + } + { + unsigned char x[1 + Inputs_bytes + Hash_bytes]; + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] = r_enc[1 + i]; + } + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Inputs_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c, x, sizeof x); + } +} + + +int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + Fq aG[p]; + int i; + randombytes(pk, Seeds_bytes); + Generator(aG, pk); + { + small a[p]; + Short_random(a); + Rq_mult_small(aG, a); + Small_encode(sk, a); + } + Round_and_encode(pk + Seeds_bytes, aG); + { + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Inputs_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Inputs_bytes); + } + return 0; +} + +int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + int i; + unsigned char cache[Hash_bytes]; + { + unsigned char y[1 + PublicKeys_bytes]; + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + Inputs r; + { + unsigned char s[Inputs_bytes]; + randombytes(s, sizeof s); + Inputs_decode(r, s); + } + { + unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(c, x, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + return 0; +} + +int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char 
*sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Inputs_bytes; + Inputs r; + int i; + { + Fq aB[p]; + Rounded_decode(aB, c); + { + small a[p]; + Small_decode(a, sk); + Rq_mult_small(aB, a); + } + { + int8 T[I]; + Top_decode(T, c + Rounded_bytes); + for (i = 0; i < I; ++i) { + r[i] = -int16_negative_mask(Fq_freeze(Right(T[i]) - aB[i] + 4 * w + 1)); + } + } + } + { + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + int mask; + unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(cnew, x, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] ^= mask & (x[1 + i] ^ rho[i]); + } + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] = 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/ntrulpr653/clean/params.h b/crypto_kem/ntrulpr653/clean/params.h new file mode 100644 index 00000000..087d6af7 --- /dev/null +++ b/crypto_kem/ntrulpr653/clean/params.h @@ -0,0 +1,63 @@ +#ifndef params_H +#define params_H +#include "crypto_core_multsntrup653.h" +#include "crypto_decode_256x16.h" +#include "crypto_decode_256x2.h" +#include "crypto_decode_653x1541.h" +#include "crypto_decode_653x3.h" +#include "crypto_decode_653xint16.h" +#include "crypto_decode_653xint32.h" +#include "crypto_encode_256x16.h" +#include "crypto_encode_256x2.h" +#include "crypto_encode_653x1541.h" +#include "crypto_encode_653x1541round.h" +#include "crypto_encode_653x3.h" +#include "crypto_encode_653xint16.h" +#include "crypto_verify_1025.h" + + +#define p 653 +#define q 4621 +#define w 252 +#define q27 29045 /* closest integer to 2^27/q */ +#define q18 57 /* closest integer to 2^18/q */ +#define tau0 2175 +#define tau1 113 +#define tau2 2031 +#define tau3 290 +#define I 256 + +#define ppadsort 653 + +#define q18 57 /* round(2^18/q) */ +#define q27 29045 /* round(2^27/q) */ +#define q31 464722 /* floor(2^31/q) */ + +#define crypto_verify_clen PQCLEAN_NTRULPR653_CLEAN_crypto_verify_1025 + +#define Rounded_bytes PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541_STRBYTES +#define Rounded_decode PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541 + +#define Round_and_encode PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round + +#define Small_bytes PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3_STRBYTES +#define Small_encode PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3 +#define Small_decode PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3 + +#define Top_bytes PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16_STRBYTES +#define Top_encode PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16 +#define Top_decode PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16 + +#define Inputs_bytes PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2_STRBYTES +#define Inputs_encode PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2 +#define Inputs_decode PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2 + +#define crypto_decode_pxint32 PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32 + +#define crypto_decode_pxint16 PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16 + +#define crypto_encode_pxint16 PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16 + +#define crypto_core_mult PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653 + +#endif diff --git a/crypto_kem/ntrulpr761/META.yml b/crypto_kem/ntrulpr761/META.yml new file mode 100644 index 00000000..62529f47 --- /dev/null +++ b/crypto_kem/ntrulpr761/META.yml @@ -0,0 +1,26 @@ +name: ntrulpr761 +type: kem 
+claimed-nist-level: 3 +claimed-security: IND-CCA2 +length-public-key: 1039 +length-secret-key: 1294 +length-ciphertext: 1167 +length-shared-secret: 32 +nistkat-sha256: 212f68484864e927c674a656ea44ea0f47c048d0dd3518b102c98a9eacd16a72 +principal-submitters: + - Daniel J. Bernstein + - Chitchanok Chuengsatiansup + - Tanja Lange + - Christine van Vredendaal +implementations: + - name: clean + version: supercop-20200826 + - name: avx2 + version: supercop-20200826 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 diff --git a/crypto_kem/ntrulpr761/avx2/LICENSE b/crypto_kem/ntrulpr761/avx2/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/ntrulpr761/avx2/Makefile b/crypto_kem/ntrulpr761/avx2/Makefile new file mode 100644 index 00000000..053dd074 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/Makefile @@ -0,0 +1,22 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libntrulpr761_avx2.a +HEADERS=api.h crypto_core_multsntrup761.h crypto_core_multsntrup761_ntt.h crypto_decode_256x16.h crypto_decode_256x2.h crypto_decode_761x1531.h crypto_decode_761x3.h crypto_decode_761xint16.h crypto_decode_761xint32.h crypto_encode_256x16.h crypto_encode_256x2.h crypto_encode_761x1531.h crypto_encode_761x1531round.h crypto_encode_761x3.h crypto_encode_761xint16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_1167.h params.h +OBJECTS=crypto_core_multsntrup761.o crypto_core_multsntrup761_ntt.o crypto_decode_256x16.o crypto_decode_256x2.o crypto_decode_761x1531.o crypto_decode_761x3.o crypto_decode_761xint16.o crypto_decode_761xint32.o crypto_encode_256x16.o crypto_encode_256x2.o crypto_encode_761x1531.o crypto_encode_761x1531round.o crypto_encode_761x3.o crypto_encode_761xint16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1167.o kem.o + +CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.s $(HEADERS) + $(AS) -o $@ $< + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/ntrulpr761/avx2/api.h b/crypto_kem/ntrulpr761/avx2/api.h new file mode 100644 index 00000000..7e1e5d40 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/api.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_API_H +#define PQCLEAN_NTRULPR761_AVX2_API_H + + + +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_ALGNAME "ntrulpr761" + +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_SECRETKEYBYTES 1294 +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_PUBLICKEYBYTES 1039 +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_CIPHERTEXTBYTES 1167 +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_BYTES 32 + +int PQCLEAN_NTRULPR761_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int PQCLEAN_NTRULPR761_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); +int PQCLEAN_NTRULPR761_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761.c b/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761.c new file mode 100644 index 00000000..b38397a3 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761.c @@ -0,0 +1,314 @@ +#include "crypto_core_multsntrup761.h" +#include 
"crypto_core_multsntrup761_ntt.h" +#include "crypto_decode_761xint16.h" +#include "crypto_encode_761xint16.h" +#include + +typedef int8_t int8; +typedef int16_t int16; + +#define int16x16 __m256i +#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p)) +#define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) +#define const_x16 _mm256_set1_epi16 +#define add_x16 _mm256_add_epi16 +#define sub_x16 _mm256_sub_epi16 +#define mullo_x16 _mm256_mullo_epi16 +#define mulhi_x16 _mm256_mulhi_epi16 +#define mulhrs_x16 _mm256_mulhrs_epi16 +#define signmask_x16(x) _mm256_srai_epi16((x),15) + +typedef union { + int16 v[3][512]; + int16x16 _dummy; +} vec3x512; + +typedef union { + int16 v[768]; + int16x16 _dummy; +} vec768; + +typedef union { + int16 v[3 * 512]; + int16x16 _dummy; +} vec1536; + +static inline int16x16 squeeze_4591_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(7)), const_x16(4591))); +} + +static inline int16x16 squeeze_7681_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); +} + +static inline int16x16 squeeze_10753_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(3)), const_x16(10753))); +} + +static inline int16x16 mulmod_4591_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(15631)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(4591)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(7681)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_10753_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-10751)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(10753)); + return sub_x16(b, e); +} + +#define mask0 _mm256_set_epi16(-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1) +#define mask1 _mm256_set_epi16(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0) +#define mask2 _mm256_set_epi16(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0) + +static void good(int16 fpad[3][512], const int16 f[768]) { + int j; + int16x16 f0, f1; + + j = 0; + for (;;) { + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[1][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[2][j], (f0 & mask2) | (f1 & mask0)); + j += 16; + if (j == 256) { + break; + } + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[1][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[2][j], (f0 & mask1) | (f1 & mask2)); + j += 16; + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[1][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[2][j], (f0 & mask0) | (f1 & mask1)); + j += 16; + } + for (;;) { + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask2); + store_x16(&fpad[1][j], f0 & mask0); + store_x16(&fpad[2][j], f0 & mask1); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask1); + store_x16(&fpad[1][j], f0 & mask2); + store_x16(&fpad[2][j], f0 & mask0); + j += 16; + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask0); + store_x16(&fpad[1][j], f0 
& mask1); + store_x16(&fpad[2][j], f0 & mask2); + j += 16; + } +} + +static void ungood(int16 f[1536], const int16 fpad[3][512]) { + int j; + int16x16 f0, f1, f2, g0, g1, g2; + + j = 0; + + for (;;) { + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g1 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask2)|(f1&mask0)|(f2&mask1) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g1 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask1)|(f1&mask2)|(f2&mask0) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g1 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask0)|(f1&mask1)|(f2&mask2) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + } +} + +static void mult768(int16 h[1536], const int16 f[768], const int16 g[768]) { + vec3x512 x1, x2; + vec1536 x3, x4; +#define fpad (x1.v) +#define gpad (x2.v) +#define hpad fpad +#define h_7681 (x3.v) +#define h_10753 (x4.v) + int i; + + good(fpad, f); + PQCLEAN_NTRULPR761_AVX2_ntt512_7681(fpad[0], 3); + + good(gpad, g); + PQCLEAN_NTRULPR761_AVX2_ntt512_7681(gpad[0], 3); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_7681_x16(f0, g0); + int16x16 d1 = mulmod_7681_x16(f1, g1); + int16x16 d2 = mulmod_7681_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_7681_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_7681_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_7681_x16(h0)); + store_x16(&hpad[1][i], squeeze_7681_x16(h1)); + store_x16(&hpad[2][i], squeeze_7681_x16(h2)); + } + + PQCLEAN_NTRULPR761_AVX2_invntt512_7681(hpad[0], 3); + ungood(h_7681, (const int16(*)[512]) hpad); + + good(fpad, f); + PQCLEAN_NTRULPR761_AVX2_ntt512_10753(fpad[0], 3); + + good(gpad, g); + PQCLEAN_NTRULPR761_AVX2_ntt512_10753(gpad[0], 3); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_10753_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_10753_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_10753_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_10753_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_10753_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_10753_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_10753_x16(f0, g0); + int16x16 d1 = mulmod_10753_x16(f1, g1); + int16x16 d2 = mulmod_10753_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_10753_x16(sub_x16(f2, f1), sub_x16(g1, g2))); 
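        /* Together with the h1/h2 lines that follow: given
           dsum = f0*g0 + f1*g1 + f2*g2, the three outputs are
             h0 = dsum + (f2-f1)*(g1-g2) = f0*g0 + f1*g2 + f2*g1
             h1 = dsum + (f1-f0)*(g0-g1) = f0*g1 + f1*g0 + f2*g2
             h2 = dsum + (f0-f2)*(g2-g0) = f0*g2 + f1*g1 + f2*g0
           i.e. h_k = sum of f_i*g_j over i+j == k (mod 3): a length-3
           cyclic convolution computed with 6 multiplications instead of 9,
           mirroring the mod-7681 pass above. */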
+ int16x16 h1 = add_x16(dsum, mulmod_10753_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_10753_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_10753_x16(h0)); + store_x16(&hpad[1][i], squeeze_10753_x16(h1)); + store_x16(&hpad[2][i], squeeze_10753_x16(h2)); + } + + PQCLEAN_NTRULPR761_AVX2_invntt512_10753(hpad[0], 3); + ungood(h_10753, (const int16(*)[512]) hpad); + + for (i = 0; i < 1536; i += 16) { + int16x16 u1 = load_x16(&h_10753[i]); + int16x16 u2 = load_x16(&h_7681[i]); + int16x16 t; + u1 = mulmod_10753_x16(u1, const_x16(1268)); + u2 = mulmod_7681_x16(u2, const_x16(956)); + t = mulmod_7681_x16(sub_x16(u2, u1), const_x16(-2539)); + t = add_x16(u1, mulmod_4591_x16(t, const_x16(-710))); + store_x16(&h[i], t); + } +} + +#define crypto_decode_pxint16 PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16 +#define crypto_encode_pxint16 PQCLEAN_NTRULPR761_AVX2_crypto_encode_761xint16 + +#define p 761 +#define q 4591 + +static inline int16x16 freeze_4591_x16(int16x16 x) { + int16x16 mask, xq; + x = add_x16(x, const_x16(q)&signmask_x16(x)); + mask = signmask_x16(sub_x16(x, const_x16((q + 1) / 2))); + xq = sub_x16(x, const_x16(q)); + x = _mm256_blendv_epi8(xq, x, mask); + return x; +} + +int PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + vec768 x1, x2; + vec1536 x3; +#define f (x1.v) +#define g (x2.v) +#define fg (x3.v) +#define h f + int i; + int16x16 x; + + x = const_x16(0); + for (i = p & ~15; i < 768; i += 16) { + store_x16(&f[i], x); + } + for (i = p & ~15; i < 768; i += 16) { + store_x16(&g[i], x); + } + + crypto_decode_pxint16(f, inbytes); + + for (i = 0; i < 768; i += 16) { + x = load_x16(&f[i]); + x = freeze_4591_x16(squeeze_4591_x16(x)); + store_x16(&f[i], x); + } + for (i = 0; i < p; ++i) { + int8 gi = kbytes[i]; + int8 gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + mult768(fg, f, g); + + fg[0] -= fg[p - 1]; + for (i = 0; i < 768; i += 16) { + int16x16 fgi = load_x16(&fg[i]); + int16x16 fgip = load_x16(&fg[i + p]); + int16x16 fgip1 = load_x16(&fg[i + p - 1]); + x = add_x16(fgi, add_x16(fgip, fgip1)); + x = freeze_4591_x16(squeeze_4591_x16(x)); + store_x16(&h[i], x); + } + + crypto_encode_pxint16(outbytes, h); + + return 0; +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761.h b/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761.h new file mode 100644 index 00000000..9b2a6d15 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_CORE_MULTSNTRUP761_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_CORE_MULTSNTRUP761_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761_OUTPUTBYTES 1522 +#define PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761_INPUTBYTES 1522 +#define PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761_KEYBYTES 761 +#define PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761_CONSTBYTES 0 + +int PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761_ntt.c b/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761_ntt.c new file mode 100644 index 00000000..dfb90c53 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761_ntt.c @@ -0,0 +1,927 @@ +#include "crypto_core_multsntrup761.h" +#include "crypto_core_multsntrup761_ntt.h" +#include +#include + +/* 
auto-generated; do not edit */ + + +typedef int8_t int8; +typedef int16_t int16; + +#define zeta(n,i) (((__m256i *) zeta_##n)[(i)]) +#define zeta_x4(n,i) (((__m256i *) zeta_x4_##n)[(i)]) +#define zeta_qinv(n,i) (((__m256i *) qinvzeta_##n)[(i)]) +#define zeta_x4_qinv(n,i) (((__m256i *) qinvzeta_x4_##n)[(i)]) +#define zetainv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_x4_##n+2*(n)+4-16*((i)+1))) +#define zetainv_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_x4_##n+2*(n)+4-16*((i)+1))) + +typedef union { + int16 data[93 * 16]; + __m256i _dummy; +} vec1488; + +static const vec1488 qdata_7681 = { .data = { + +#define q_x16 (qdata[0]) + 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, + +#define qrecip_x16 (qdata[1]) + 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, + +#define qshift_x16 (qdata[2]) + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + +#define zeta4_x16 (qdata[3]) + -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, + +#define zeta4_x16_qinv (qdata[4]) + -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, + +#define zeta8_x16 (qdata[5]) + -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, + +#define zeta8_x16_qinv (qdata[6]) + -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, + +#define zetainv8_x16 (qdata[7]) + -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, + +#define zetainv8_x16_qinv (qdata[8]) + -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, + +#define zeta_x4_16 (qdata+9) + -3593, -3593, -3593, -3593, -2194, -2194, -2194, -2194, -3625, -3625, -3625, -3625, 1100, 1100, 1100, 1100, + -3777, -3777, -3777, -3777, -2456, -2456, -2456, -2456, 3182, 3182, 3182, 3182, 3696, 3696, 3696, 3696, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_16 (qdata+12) + -9, -9, -9, -9, 4974, 4974, 4974, 4974, -16425, -16425, -16425, -16425, 7244, 7244, 7244, 7244, + -28865, -28865, -28865, -28865, -14744, -14744, -14744, -14744, 10350, 10350, 10350, 10350, -4496, -4496, -4496, -4496, + 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_x4_32 (qdata+15) + -3593, -3593, -3593, -3593, 1414, 1414, 1414, 1414, -2194, -2194, -2194, -2194, -2495, -2495, -2495, -2495, + -3625, -3625, -3625, -3625, 2876, 2876, 2876, 2876, 1100, 1100, 1100, 1100, -2250, -2250, -2250, -2250, + -3777, -3777, -3777, -3777, -1701, -1701, -1701, -1701, -2456, -2456, -2456, -2456, 834, 834, 834, 834, + 3182, 3182, 3182, 3182, -2319, -2319, -2319, -2319, 3696, 3696, 3696, 3696, 121, 121, 121, 121, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_32 (qdata+20) + -9, -9, -9, -9, 20870, 20870, 20870, 20870, 4974, 4974, 4974, 4974, 22593, 22593, 22593, 22593, + -16425, -16425, -16425, -16425, 828, 828, 828, 828, 7244, 7244, 7244, 7244, -23754, -23754, -23754, -23754, + 
-28865, -28865, -28865, -28865, 20315, 20315, 20315, 20315, -14744, -14744, -14744, -14744, 18242, 18242, 18242, 18242, + 10350, 10350, 10350, 10350, -18191, -18191, -18191, -18191, -4496, -4496, -4496, -4496, -11655, -11655, -11655, -11655, + 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_64 (qdata+25) + -3593, -617, 1414, 3706, -2194, -1296, -2495, -2237, -3625, 2830, 2876, -1599, 1100, 1525, -2250, 2816, + -3777, 1921, -1701, 2006, -2456, 1483, 834, -1986, 3182, 3364, -2319, -1993, 3696, -2557, 121, 2088, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_64 (qdata+28) + -9, 19351, 20870, -15750, 4974, -9488, 22593, 7491, -16425, 26382, 828, 23489, 7244, 20469, -23754, 2816, + -28865, -5759, 20315, -3114, -14744, 15307, 18242, -19394, 10350, -10972, -18191, -31177, -4496, -25597, -11655, 22568, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_128 (qdata+31) + -3593, -2804, -617, -396, 1414, -549, 3706, 810, -2194, -1321, -1296, 438, -2495, -2535, -2237, -3689, + -3625, 2043, 2830, -1881, 2876, 3153, -1599, 7, 1100, -514, 1525, -1760, -2250, -2440, 2816, 3600, + -3777, 103, 1921, -3174, -1701, 1535, 2006, -1887, -2456, 1399, 1483, -679, 834, 3772, -1986, 1738, + 3182, -1431, 3364, -3555, -2319, -2310, -1993, 638, 3696, -2956, -2557, -1305, 121, 2555, 2088, -3266, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_128 (qdata+36) + -9, -29428, 19351, 26228, 20870, 21467, -15750, 5930, 4974, -14121, -9488, -21066, 22593, 2073, 7491, 16279, + -16425, -25093, 26382, 26279, 828, -29103, 23489, 11783, 7244, 14846, 20469, 14624, -23754, -6536, 2816, 11792, + -28865, -4505, -5759, -6246, 20315, 9215, -3114, 6817, -14744, 4983, 15307, -28839, 18242, 1724, -19394, 23242, + 10350, -21399, -10972, -29667, -18191, -21766, -31177, 15998, -4496, 23668, -25597, -5913, -11655, -24581, 22568, -20674, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_256 (qdata+41) + -3593, 2665, -2804, -2572, -617, 727, -396, 3417, 1414, 2579, -549, 373, 3706, 3750, 810, -1054, + -2194, -2133, -1321, 1681, -1296, -1386, 438, -2732, -2495, 1919, -2535, -2391, -2237, 2835, -3689, 2, + -3625, -783, 2043, 3145, 2830, 1533, -1881, 2789, 2876, 2649, 3153, 3692, -1599, -1390, 7, -1166, + 1100, 3310, -514, 2224, 1525, -2743, -1760, 2385, -2250, -486, -2440, -1756, 2816, -3816, 3600, -3831, + -3777, -1799, 103, 1497, 1921, 1521, -3174, -194, -1701, -859, 1535, 2175, 2006, -2762, -1887, -1698, + -2456, -3480, 1399, 2883, 1483, -3428, -679, -2113, 834, 1532, 3772, -660, -1986, -2764, 1738, -915, + 3182, 1056, -1431, 1350, 3364, 1464, -3555, 2919, -2319, -2160, -2310, 730, -1993, -1598, 638, 3456, + 3696, -1168, -2956, -3588, -2557, -921, -1305, 3405, 121, -404, 2555, -3135, 2088, 2233, -3266, -2426, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_256 (qdata+50) + -9, -17303, -29428, 24052, 19351, -12073, 26228, -24743, 20870, -12269, 21467, 19317, -15750, -25946, 5930, 32738, + 4974, -4693, -14121, 2193, -9488, 26262, -21066, 7508, 22593, 9599, 2073, 10409, 7491, -12013, 16279, -15358, + -16425, -16655, -25093, 32329, 26382, 24573, 26279, 13541, 828, -25511, -29103, 26220, 23489, -8558, 11783, -24718, + 7244, 10478, 14846, 26800, 20469, 26441, 14624, -29871, -23754, -3558, -6536, -16092, 2816, 8472, 11792, -7415, + -28865, -13575, -4505, -26663, -5759, -14351, -6246, -17602, 20315, -22875, 9215, 9855, -3114, -24266, 6817, -2722, + -14744, -15768, 4983, 12611, 15307, -21860, -28839, -27201, 18242, 32252, 1724, 21868, 
-19394, -8908, 23242, 13933, + 10350, 17440, -21399, -11962, -10972, 30136, -29667, -1689, -18191, 6032, -21766, 30426, -31177, 15810, 15998, 3456, + -4496, -9360, 23668, 27132, -25597, -5529, -5913, 1869, -11655, 22124, -24581, 21953, 22568, 23225, -20674, 17030, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_512 (qdata+59) + -3593, 2005, 2665, 2891, -2804, 2345, -2572, 1121, -617, -188, 727, 2786, -396, -3208, 3417, -17, + 1414, -3752, 2579, 2815, -549, 1837, 373, 151, 3706, -1012, 3750, -1509, 810, -3214, -1054, 3177, + -2194, -1403, -2133, -3314, -1321, 83, 1681, -658, -1296, 2070, -1386, -3547, 438, 3781, -2732, 2230, + -2495, -1669, 1919, 2589, -2535, -3312, -2391, -3542, -2237, -1441, 2835, -3568, -3689, -402, 2, -1070, + -3625, 3763, -783, -3550, 2043, -2303, 3145, -436, 2830, -893, 1533, 1712, -1881, 124, 2789, -2001, + 2876, -2460, 2649, 3770, 3153, 2965, 3692, -1203, -1599, 2874, -1390, -1407, 7, -3745, -1166, 1649, + 1100, 2937, 3310, 3461, -514, -1526, 2224, 715, 1525, -1689, -2743, 434, -1760, -3163, 2385, -929, + -2250, -2167, -486, -1144, -2440, -370, -1756, 2378, 2816, -1084, -3816, -1586, 3600, 1931, -3831, -1242, + -3777, 592, -1799, 2340, 103, -1338, 1497, -2071, 1921, 1519, 1521, 451, -3174, 589, -194, -3744, + -1701, 3677, -859, -1295, 1535, 642, 2175, -3794, 2006, 2130, -2762, 2918, -1887, 3334, -1698, 2072, + -2456, 509, -3480, 2998, 1399, -3408, 2883, 1476, 1483, -2262, -3428, -1779, -679, 2258, -2113, 1348, + 834, -692, 1532, 2247, 3772, 2083, -660, -226, -1986, 2532, -2764, -3693, 1738, -429, -915, -2059, + 3182, 2812, 1056, 3434, -1431, -2515, 1350, -236, 3364, -2386, 1464, 222, -3555, -2963, 2919, -2422, + -2319, -3657, -2160, 3450, -2310, -791, 730, 1181, -1993, -1404, -1598, 2339, 638, -3366, 3456, 2161, + 3696, -3343, -1168, 2719, -2956, -826, -3588, -670, -2557, 777, -921, 1151, -1305, -796, 3405, -1278, + 121, -3287, -404, 1072, 2555, 293, -3135, 2767, 2088, -3335, 2233, 3581, -3266, 3723, -2426, -179, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_512 (qdata+76) + -9, 4565, -17303, 16715, -29428, 15145, 24052, -22943, 19351, 1860, -12073, -28958, 26228, -7304, -24743, -529, + 20870, -24232, -12269, 10495, 21467, -16083, 19317, 20119, -15750, -27636, -25946, -12261, 5930, -26766, 32738, -16791, + 4974, 25733, -4693, 20238, -14121, 18003, 2193, 6510, -9488, 29718, 26262, -25563, -21066, -1851, 7508, -19274, + 22593, -28805, 9599, -23523, 2073, 4880, 10409, 1578, 7491, -10145, -12013, 4624, 16279, 6766, -15358, 24530, + -16425, 5299, -16655, -2526, -25093, -9983, 32329, 5708, 26382, -23933, 24573, 26288, 26279, 30844, 13541, 30255, + 828, 15972, -25511, 17082, -29103, -27243, 26220, -2739, 23489, 16186, -8558, -9087, 11783, -12449, -24718, -14223, + 7244, -8839, 10478, 30597, 14846, -12790, 26800, 14539, 20469, -6297, 26441, 9650, 14624, -25179, -29871, -9633, + -23754, -5751, -3558, 2952, -6536, 23182, -16092, 23882, 2816, 964, 8472, -10802, 11792, -17013, -7415, -30938, + -28865, -23984, -13575, -11996, -4505, -14650, -26663, -22039, -5759, 1007, -14351, 10179, -6246, -947, -17602, -20128, + 20315, 10333, -22875, -17167, 9215, -14718, 9855, -29394, -3114, 27730, -24266, 5990, 6817, 22790, -2722, 14360, + -14744, 23549, -15768, -18506, 4983, 21168, 12611, 3524, 15307, 2858, -21860, 29453, -28839, 27858, -27201, 3396, + 18242, 5452, 32252, -18745, 1724, -4573, 21868, 31518, -19394, 20964, -8908, -18541, 23242, 17491, 13933, 16885, + 10350, -32004, 17440, -24214, -21399, -20435, -11962, -22764, -10972, -27986, 
30136, -802, -29667, 11885, -1689, -13686, + -18191, 32695, 6032, -16006, -21766, -20759, 30426, -24931, -31177, -32124, 15810, -4317, 15998, 26330, 3456, -13711, + -4496, -19215, -9360, 26783, 23668, -14138, 27132, -32414, -25597, -2807, -5529, 8831, -5913, 17636, 1869, -16638, + -11655, 9513, 22124, 25648, -24581, -21723, 21953, -14129, 22568, -15111, 23225, 26621, -20674, -15221, 17030, -1715, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static const vec1488 qdata_10753 = { .data = { + + 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, + + 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, + + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + + 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, + + 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, + + 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, + + -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, + + 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, + + -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, + + 1018, 1018, 1018, 1018, 2413, 2413, 2413, 2413, 4188, 4188, 4188, 4188, 357, 357, 357, 357, + 223, 223, 223, 223, -3686, -3686, -3686, -3686, -3688, -3688, -3688, -3688, -376, -376, -376, -376, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 10093, 10093, 10093, 10093, -1956, -1956, -1956, -1956, 28517, 28517, 28517, 28517, + 27359, 27359, 27359, 27359, -21094, -21094, -21094, -21094, 408, 408, 408, 408, -20856, -20856, -20856, -20856, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 1018, 1018, 1018, -2695, -2695, -2695, -2695, 2413, 2413, 2413, 2413, 425, 425, 425, 425, + 4188, 4188, 4188, 4188, -4855, -4855, -4855, -4855, 357, 357, 357, 357, -3364, -3364, -3364, -3364, + 223, 223, 223, 223, 730, 730, 730, 730, -3686, -3686, -3686, -3686, -4544, -4544, -4544, -4544, + -3688, -3688, -3688, -3688, -2236, -2236, -2236, -2236, -376, -376, -376, -376, 3784, 3784, 3784, 3784, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 7033, 7033, 7033, 7033, 10093, 10093, 10093, 10093, 18345, 18345, 18345, 18345, + -1956, -1956, -1956, -1956, 29449, 29449, 29449, 29449, 28517, 28517, 28517, 28517, -9508, -9508, -9508, -9508, + 27359, 27359, 27359, 27359, 16090, 16090, 16090, 16090, -21094, -21094, -21094, -21094, 28224, 28224, 28224, 28224, + 408, 408, 408, 408, -12476, -12476, -12476, -12476, -20856, -20856, -20856, -20856, 16072, 16072, 16072, 16072, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -1520, -2695, 1341, 2413, 918, 425, 5175, 4188, -4035, -4855, 341, 357, 4347, -3364, 5213, + 223, -4875, 730, 1931, -3686, -2503, -4544, -4095, -3688, 5063, -2236, -3823, -376, 3012, 3784, -2629, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 23056, 7033, 829, 10093, 26518, 18345, 3639, -1956, -4547, 29449, 3925, 28517, -7429, -9508, -11683, + 27359, -17675, 16090, 14731, -21094, -25543, 28224, -14847, 408, 28103, -12476, 10001, -20856, -7228, 16072, 18363, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -2935, -1520, -4744, -2695, -205, 1341, 1299, 2413, 4, 918, -4379, 425, -4616, 5175, -544, + 4188, 
4129, -4035, 4102, -4855, -1287, 341, -2388, 357, 1284, 4347, 2984, -3364, 2178, 5213, -2576, + 223, 2790, -4875, 4876, 730, -4513, 1931, -3085, -3686, 3550, -2503, 847, -4544, 193, -4095, 1085, + -3688, 3091, 5063, -4742, -2236, 2982, -3823, -1009, -376, -268, 3012, 3062, 3784, -2565, -2629, 4189, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 31369, 23056, 15736, 7033, -24269, 829, -6381, 10093, 22532, 26518, 23781, 18345, 15864, 3639, 15840, + -1956, -23007, -4547, 5126, 29449, 8441, 3925, -16724, 28517, 23812, -7429, 31656, -9508, -19326, -11683, -27152, + 27359, 20198, -17675, 6924, 16090, 22623, 14731, 5619, -21094, -24098, -25543, 3407, 28224, 22209, -14847, 573, + 408, -4589, 28103, -5766, -12476, -12378, 10001, -31217, -20856, -2316, -7228, -20490, 16072, -14341, 18363, -12707, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -4734, -2935, -400, -1520, 4977, -4744, -2973, -2695, 512, -205, -779, 1341, -1356, 1299, 635, + 2413, 567, 4, -4286, 918, -5114, -4379, -1586, 425, 1615, -4616, -336, 5175, -1841, -544, 2234, + 4188, -3441, 4129, 636, -4035, -4580, 4102, 2684, -4855, 3057, -1287, -2740, 341, -5156, -2388, -472, + 357, -794, 1284, 578, 4347, 3615, 2984, -3715, -3364, 2271, 2178, -326, 5213, 454, -2576, -3337, + 223, 2998, 2790, -151, -4875, 2981, 4876, 1324, 730, 2774, -4513, 2206, 1931, 886, -3085, -970, + -3686, 3198, 3550, 2737, -2503, -909, 847, 1068, -4544, -2213, 193, 2884, -4095, -4808, 1085, 4123, + -3688, 5341, 3091, 5294, 5063, -116, -4742, -5116, -2236, -2045, 2982, -1572, -3823, 4828, -1009, 467, + -376, 5023, -268, -3169, 3012, -1458, 3062, -1268, 3784, -675, -2565, 1006, -2629, 5064, 4189, 864, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -26238, 31369, -24976, 23056, -30351, 15736, -18845, 7033, 512, -24269, -13579, 829, 29364, -6381, -11141, + 10093, -969, 22532, 6978, 26518, -4090, 23781, 11726, 18345, 4175, 15864, 7856, 3639, 719, 15840, -31558, + -1956, 31887, -23007, -21892, -4547, 22044, 5126, -19844, 29449, -32271, 8441, 32076, 3925, -11300, -16724, 28200, + 28517, 16614, 23812, 11842, -7429, -2017, 31656, 28541, -9508, 29407, -19326, 31418, -11683, -31290, -27152, 27895, + 27359, 12214, 20198, -14999, -17675, -1627, 6924, -13012, 16090, -4394, 22623, 7326, 14731, -22666, 5619, 8246, + -21094, 24702, -24098, 177, -25543, 7795, 3407, -13268, 28224, 2395, 22209, -7356, -14847, -17096, 573, -24037, + 408, -11555, -4589, -30546, 28103, 1932, -5766, 17412, -12476, 31235, -12378, -7716, 10001, -1316, -31217, 25555, + -20856, -609, -2316, -8801, -7228, 11854, -20490, 780, 16072, -17571, -14341, -2066, 18363, 17352, -12707, 17248, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 3453, -4734, 4519, -2935, 2118, -400, -554, -1520, 2196, 4977, 1893, -4744, -1409, -2973, -1053, + -2695, 4601, 512, 279, -205, -3241, -779, 4889, 1341, 3524, -1356, -1663, 1299, 2283, 635, 73, + 2413, 2428, 567, 624, 4, -1930, -4286, 3419, 918, -2062, -5114, 5068, -4379, -97, -1586, 1782, + 425, 4621, 1615, 355, -4616, 1349, -336, 825, 5175, 3135, -1841, 1160, -544, 4408, 2234, -2605, + 4188, 854, -3441, -1056, 4129, 2439, 636, 4967, -4035, -4782, -4580, -5268, 4102, -663, 2684, -4670, + -4855, 3760, 3057, 3535, -1287, 2680, -2740, -569, 341, 2139, -5156, 3827, -2388, 1639, -472, 1927, + 357, 5172, -794, -4003, 1284, 4144, 578, 693, 4347, 4784, 3615, 3125, 2984, 1122, -3715, 2113, + -3364, -573, 2271, -4328, 2178, 2909, -326, -4000, 5213, -4447, 454, -3995, -2576, -4428, -3337, 2529, + 223, 5309, 2998, 5120, 2790, -2050, -151, 2963, 
-4875, 2657, 2981, -2807, 4876, 2237, 1324, -4403, + 730, 2624, 2774, -5083, -4513, 40, 2206, 152, 1931, -1573, 886, 2625, -3085, -778, -970, -5107, + -3686, 4250, 3198, -5356, 3550, -3148, 2737, -3360, -2503, -2015, -909, 3096, 847, 5313, 1068, 834, + -4544, -1132, -2213, -2151, 193, -1722, 2884, -4393, -4095, 2662, -4808, -2788, 1085, -1992, 4123, 5334, + -3688, 5215, 5341, -1689, 3091, -2117, 5294, 4859, 5063, 3410, -116, 2205, -4742, -2374, -5116, -4720, + -2236, 3570, -2045, 2813, 2982, 2087, -1572, -4973, -3823, 458, 4828, 3891, -1009, -2419, 467, -4891, + -376, -1381, 5023, 1204, -268, 274, -3169, -3260, 3012, -1635, -1458, 4540, 3062, -4254, -1268, -1111, + 3784, 2230, -675, -2279, -2565, -4359, 1006, -1510, -2629, 5015, 5064, -2449, 4189, -5005, 864, 2487, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -29827, -26238, -21593, 31369, -29626, -24976, -7722, 23056, -16236, -30351, 30053, 15736, 9343, -18845, -16925, + 7033, 14329, 512, 15127, -24269, -21161, -13579, -1767, 829, -6716, 29364, -12415, -6381, 31467, -11141, 1609, + 10093, -20100, -969, -23952, 22532, -25482, 6978, 8027, 26518, 17394, -4090, -25652, 23781, -5729, 11726, -21770, + 18345, -4083, 4175, -15517, 15864, -19643, 7856, -22215, 3639, -18881, 719, -19320, 15840, -7880, -31558, 22483, + -1956, -6314, 31887, 15328, -23007, -7289, -21892, 11623, -4547, 31058, 22044, 13164, 5126, -15511, -19844, 6594, + 29449, 11952, -32271, 6095, 8441, 23160, 32076, 22471, 3925, 6747, -11300, 12531, -16724, 8295, 28200, -7801, + 28517, -29644, 16614, -20899, 23812, 12336, 11842, 20661, -7429, 12976, -2017, 23093, 31656, -3998, 28541, 24129, + -9508, -61, 29407, -232, -19326, -13987, 31418, 12384, -11683, -31583, -31290, 24165, -27152, 26292, 27895, 8161, + 27359, 4797, 12214, 5120, 20198, 19454, -14999, -4717, -17675, 8289, -1627, 31497, 6924, 1725, -13012, 19661, + 16090, -30144, -4394, -9691, 22623, 28712, 7326, 4248, 14731, 3035, -22666, 24641, 5619, -24330, 8246, -13811, + -21094, -13158, 24702, -23788, -24098, 27572, 177, 13024, -25543, -29151, 7795, 7192, 3407, 27329, -13268, 12098, + 28224, -19564, 2395, -8807, 22209, 32070, -7356, -22313, -14847, 20070, -17096, 23836, 573, -14280, -24037, -1834, + 408, 32351, -11555, 4967, -4589, 18875, -30546, -6917, 28103, -26286, 1932, 18077, -5766, 29370, 17412, 19856, + -12476, 23026, 31235, -30467, -12378, -24025, -7716, -12653, 10001, -8758, -1316, -20173, -31217, -11123, 25555, 23269, + -20856, -29541, -609, 31924, -2316, 3346, -8801, -13500, -7228, 14237, 11854, 14780, -20490, -9374, 780, 16809, + 16072, 11446, -17571, -8935, -14341, 5369, -2066, -18918, 18363, 19863, 17352, -16273, -12707, 3699, 17248, 951, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static inline __m256i sub_x16(__m256i a, __m256i b) { + //__asm__("vpsubw %1,%0,%0" : "+x"(a),"+x"(b)); + return _mm256_sub_epi16(a, b); +} + +static inline __m256i add_x16(__m256i a, __m256i b) { + return _mm256_add_epi16(a, b); +} + +static inline __m256i reduce_x16(const __m256i *qdata, __m256i x) { + __m256i y = _mm256_mulhi_epi16(x, qrecip_x16); + y = _mm256_mulhrs_epi16(y, qshift_x16); + y = _mm256_mullo_epi16(y, q_x16); + return sub_x16(x, y); +} + +static inline __m256i mulmod_x16_scaled(const __m256i *qdata, __m256i x, __m256i y, __m256i yqinv) { + __m256i b = _mm256_mulhi_epi16(x, y); + __m256i d = _mm256_mullo_epi16(x, yqinv); + __m256i e = _mm256_mulhi_epi16(d, q_x16); + return sub_x16(b, e); +} + +typedef union { + int8 data[32]; + __m256i _dummy; +} byte32; +static const byte32 shuffle_buf = 
{ .data = { + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + } +}; +#define shuffle (*(__m256i *) shuffle_buf.data) + +static inline __m256i _mm256_loadu_reverse16(const __m256i *p) { + __m256i x = _mm256_loadu_si256(p); + x = _mm256_permute2x128_si256(x, x, 1); + x = _mm256_shuffle_epi8(x, shuffle); + return x; +} + +static void ntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + for (rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 = _mm256_loadu_si256((__m256i *) (f + 96)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_0, zetainv_qinv_128_0); + f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 0), zeta_qinv(128, 0)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 0), zeta_qinv(64, 0)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = _mm256_loadu_si256((__m256i *) (f + 80)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_1, zetainv_qinv_128_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 1), zeta_qinv(128, 1)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 1), zeta_qinv(64, 1)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = 
_mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 0), zeta_x4_qinv(16, 0)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 32)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 1), zeta_x4_qinv(16, 1)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); + _mm256_storeu_si256((__m256i *) (f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + + f1 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 32)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f2 = add_x16(g2, g3); + f3 = sub_x16(g2, g3); + f2 = reduce_x16(qdata, f2); + f3 = reduce_x16(qdata, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f0 = reduce_x16(qdata, f0); + + h0 = f0; + h1 = f1; + h2 = f2; + h3 = f3; + + f1 = _mm256_loadu_si256((__m256i *) (f + 80)); + f3 = 
_mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv8_x16, zetainv8_x16_qinv); + f2 = mulmod_x16_scaled(qdata, f2, zeta8_x16, zeta8_x16_qinv); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta4_x16, zeta4_x16_qinv); + f0 = reduce_x16(qdata, f0); + + g0 = add_x16(h0, f0); + g1 = add_x16(h1, f1); + g2 = add_x16(h2, f2); + g3 = add_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 0), g0); + _mm256_storeu_si256((__m256i *) (f + 16), g1); + _mm256_storeu_si256((__m256i *) (f + 32), g2); + _mm256_storeu_si256((__m256i *) (f + 48), g3); + g0 = sub_x16(h0, f0); + g1 = sub_x16(h1, f1); + g2 = sub_x16(h2, f2); + g3 = sub_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 64), g0); + _mm256_storeu_si256((__m256i *) (f + 80), g1); + _mm256_storeu_si256((__m256i *) (f + 96), g2); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + f += 128; + } +} + +static void ntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + int16 *origf = f; + int rep; + __m256i zetainv_512[8]; + __m256i zetainv_qinv_512[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_512[i], zetainv_qinv_512[i]); + f2 = mulmod_x16_scaled(qdata, f2, zeta(512, i), zeta_qinv(512, i)); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(256, i), zeta_qinv(256, i)); + f0 = reduce_x16(qdata, f0); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i), f0); + + } + f += 512; + } + f = origf; + ntt128(f, reps * 4, qdata); +} + +void PQCLEAN_NTRULPR761_AVX2_ntt512_7681(int16 *f, int reps) { + ntt512(f, reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_NTRULPR761_AVX2_ntt512_10753(int16 *f, int reps) { + ntt512(f, reps, (const __m256i *) qdata_10753.data); +} + +static void invntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_x4_16_0 = zetainv_x4(16, 0); + __m256i zetainv_x4_qinv_16_0 = zetainv_x4_qinv(16, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_64_0 = zetainv(64, 0); + __m256i zetainv_qinv_64_0 = zetainv_qinv(64, 0); + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i 
zetainv_x4_16_1 = zetainv_x4(16, 1); + __m256i zetainv_x4_qinv_16_1 = zetainv_x4_qinv(16, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + __m256i zetainv_64_1 = zetainv(64, 1); + __m256i zetainv_qinv_64_1 = zetainv_qinv(64, 1); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g0 = _mm256_loadu_si256((__m256i *) (f + 32)); + g1 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + h1 = sub_x16(f0, f1); + h1 = reduce_x16(qdata, h1); + h0 = add_x16(f0, f1); + h3 = sub_x16(f2, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h2 = add_x16(f2, f3); + f1 = sub_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zetainv8_x16, zetainv8_x16_qinv); + f0 = add_x16(g0, g1); + f3 = sub_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zeta8_x16, zeta8_x16_qinv); + f2 = add_x16(g2, g3); + + g0 = add_x16(h0, h2); + g0 = reduce_x16(qdata, g0); + g2 = sub_x16(h0, h2); + g2 = reduce_x16(qdata, g2); + g1 = sub_x16(h1, h3); + g3 = add_x16(h1, h3); + h2 = sub_x16(f0, f2); + h2 = mulmod_x16_scaled(qdata, h2, zeta4_x16, zeta4_x16_qinv); + h0 = add_x16(f0, f2); + h3 = add_x16(f1, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h1 = sub_x16(f1, f3); + + f0 = add_x16(g0, h0); + g0 = sub_x16(g0, h0); + f1 = add_x16(g1, h1); + g1 = sub_x16(g1, h1); + f2 = sub_x16(g2, h2); + g2 = add_x16(g2, h2); + f3 = sub_x16(g3, h3); + g3 = add_x16(g3, h3); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), g0); + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 96), g1); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + _mm256_storeu_si256((__m256i *) (f + 48), g2); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = _mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 = _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_0, zetainv_x4_qinv_16_0); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f0 = 
_mm256_loadu_si256((__m256i *) (f + 32)); + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = _mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 = _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_1, zetainv_x4_qinv_16_1); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); + _mm256_storeu_si256((__m256i *) (f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 = _mm256_loadu_si256((__m256i *) (f + 96)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = _mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_0, zetainv_qinv_128_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 0), zeta_qinv(128, 0)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_0, zetainv_qinv_64_0); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = _mm256_loadu_si256((__m256i *) (f + 80)); + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = 
_mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_1, zetainv_qinv_128_1); + f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 1), zeta_qinv(128, 1)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_1, zetainv_qinv_64_1); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + + f += 128; + } +} + +static void invntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + /* [-Werror=unused-variable] */ /* int16 *origf = f; */ + int rep; + __m256i zetainv_512[8]; + __m256i zetainv_qinv_512[8]; + __m256i zetainv_256[8]; + __m256i zetainv_qinv_256[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_256[i] = zetainv(256, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_256[i] = zetainv_qinv(256, i); + } + invntt128(f, 4 * reps, qdata); + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_512[i], zetainv_qinv_512[i]); + f3 = mulmod_x16_scaled(qdata, f3, zeta(512, i), zeta_qinv(512, i)); + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_256[i], zetainv_qinv_256[i]); + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + } + f += 512; + } +} + +void PQCLEAN_NTRULPR761_AVX2_invntt512_7681(int16 *f, int reps) { + invntt512(f, reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_NTRULPR761_AVX2_invntt512_10753(int16 *f, int reps) { + invntt512(f, reps, (const __m256i *) qdata_10753.data); +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761_ntt.h b/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761_ntt.h new file mode 100644 index 00000000..5be35b95 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_core_multsntrup761_ntt.h @@ -0,0 +1,13 @@ +#ifndef ntt_H +#define ntt_H + +#include + + + +extern void PQCLEAN_NTRULPR761_AVX2_ntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_NTRULPR761_AVX2_ntt512_10753(int16_t *f, int reps); +extern void PQCLEAN_NTRULPR761_AVX2_invntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_NTRULPR761_AVX2_invntt512_10753(int16_t *f, int reps); + +#endif diff 
--git a/crypto_kem/ntrulpr761/avx2/crypto_decode_256x16.c b/crypto_kem/ntrulpr761/avx2/crypto_decode_256x16.c new file mode 100644 index 00000000..e7bcbcd9 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_256x16.c @@ -0,0 +1,11 @@ +#include "crypto_decode_256x16.h" + + +void PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x16(void *v, const unsigned char *s) { + unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + T[2 * i] = s[i] & 15; + T[2 * i + 1] = s[i] >> 4; + } +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_decode_256x16.h b/crypto_kem/ntrulpr761/avx2/crypto_decode_256x16.h new file mode 100644 index 00000000..ec6d214e --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_256X16_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_256X16_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_decode_256x2.c b/crypto_kem/ntrulpr761/avx2/crypto_decode_256x2.c new file mode 100644 index 00000000..defc1b76 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_256x2.c @@ -0,0 +1,27 @@ +#include "crypto_decode_256x2.h" +#include +#include + +#define COPY _mm256_set_epi64x(0x0303030303030303,0x0202020202020202,0x0101010101010101,0x0000000000000000) +#define MASK _mm256_set1_epi64x(0x8040201008040201) +#define MASK2 _mm256_set1_epi64x(0x0101010101010101) + +void PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x2(void *v, const unsigned char *s) { + __m256i *r = v; + int i; + + for (i = 0; i < 8; ++i) { + /* bytes s0 s1 s2 s3 */ + __m256i x = _mm256_set1_epi32(*(int32_t *) s); + /* s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 */ + x = _mm256_shuffle_epi8(x, COPY); + /* s0 s0 s0 s0 s0 s0 s0 s0 s1 s1 s1 s1 s1 s1 s1 s1 s2 s2 s2 s2 s2 s2 s2 s2 s3 s3 s3 s3 s3 s3 s3 s3 */ + x = _mm256_andnot_si256(x, MASK); + x = _mm256_cmpeq_epi8(x, _mm256_setzero_si256()); + x &= MASK2; + _mm256_storeu_si256(r, x); + + s += 4; + r += 1; + } +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_decode_256x2.h b/crypto_kem/ntrulpr761/avx2/crypto_decode_256x2.h new file mode 100644 index 00000000..19661cc5 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_256X2_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_256X2_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x2(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_decode_761x1531.c b/crypto_kem/ntrulpr761/avx2/crypto_decode_761x1531.c new file mode 100644 index 00000000..3b373183 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_761x1531.c @@ -0,0 +1,436 @@ +#include "crypto_decode_761x1531.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define int32 int32_t + +static inline int16 mullo(int16 x, int16 y) { + return x * y; +} + +static inline int16 mulhi(int16 x, int16 y) { + return (x * (int32)y) >> 16; +} + +static inline __m256i add(__m256i x, __m256i y) { + return 
_mm256_add_epi16(x, y); +} + +static inline __m256i sub(__m256i x, __m256i y) { + return _mm256_sub_epi16(x, y); +} + +static inline __m256i shiftleftconst(__m256i x, int16 y) { + return _mm256_slli_epi16(x, y); +} + +static inline __m256i signedshiftrightconst(__m256i x, int16 y) { + return _mm256_srai_epi16(x, y); +} + +static inline __m256i addconst(__m256i x, int16 y) { + return add(x, _mm256_set1_epi16(y)); +} + +static inline __m256i subconst(__m256i x, int16 y) { + return sub(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulloconst(__m256i x, int16 y) { + return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulhiconst(__m256i x, int16 y) { + return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i ifgesubconst(__m256i x, int16 y) { + __m256i y16 = _mm256_set1_epi16(y); + __m256i top16 = _mm256_set1_epi16((int16)(y - 1)); + return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); +} + +static inline __m256i ifnegaddconst(__m256i x, int16 y) { + return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); +} + +void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531(void *v, const unsigned char *s) { + int16 *R0 = v; + int16 R1[381], R2[191], R3[96], R4[48], R5[24], R6[12], R7[6], R8[3], R9[2], R10[1]; + long long i; + int16 a0, a1, a2; + __m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; + + s += PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531_STRBYTES; + a1 = 0; + a1 += *--s; /* 0...255 */ + a1 = mulhi(a1, -84) - mulhi(mullo(a1, -4828), 3475); + a1 += *--s; /* -1738...1992 */ + a1 += (a1 >> 15) & 3475; /* 0...3474 */ + R10[0] = a1; + + /* R10 ------> R9: reconstruct mod 1*[593]+[1500] */ + + i = 0; + s -= 1; + a2 = a0 = R10[0]; + a0 = mulhi(a0, 60) - mulhi(mullo(a0, -28292), 593); /* -297...311 */ + a0 += s[1 * i + 0]; /* -297...566 */ + a0 += (a0 >> 15) & 593; /* 0...592 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -31055); + + /* invalid inputs might need reduction mod 1500 */ + a1 -= 1500; + a1 += (a1 >> 15) & 1500; + + R9[0] = a0; + R9[1] = a1; + s -= 0; + + /* R9 ------> R8: reconstruct mod 2*[6232]+[1500] */ + + R8[2] = R9[1]; + s -= 2; + for (i = 0; i >= 0; --i) { + a2 = a0 = R9[i]; + a0 = mulhi(a0, 672) - mulhi(mullo(a0, -2692), 6232); /* -3116...3284 */ + a0 += s[2 * i + 1]; /* -3116...3539 */ + a0 = mulhi(a0, 672) - mulhi(mullo(a0, -2692), 6232); /* -3148...3152 */ + a0 += s[2 * i + 0]; /* -3148...3407 */ + a0 += (a0 >> 15) & 6232; /* 0...6231 */ + a1 = (a2 << 13) + (s[2 * i + 1] << 5) + ((s[2 * i] - a0) >> 3); + a1 = mullo(a1, 12451); + + /* invalid inputs might need reduction mod 6232 */ + a1 -= 6232; + a1 += (a1 >> 15) & 6232; + + R8[2 * i] = a0; + R8[2 * i + 1] = a1; + } + + /* R8 ------> R7: reconstruct mod 5*[1263]+[304] */ + + i = 0; + s -= 1; + a2 = a0 = R8[2]; + a0 = mulhi(a0, -476) - mulhi(mullo(a0, -13284), 1263); /* -751...631 */ + a0 += s[1 * i + 0]; /* -751...886 */ + a0 += (a0 >> 15) & 1263; /* 0...1262 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -22001); + + /* invalid inputs might need reduction mod 304 */ + a1 -= 304; + a1 += (a1 >> 15) & 304; + + R7[4] = a0; + R7[5] = a1; + s -= 2; + for (i = 1; i >= 0; --i) { + a2 = a0 = R8[i]; + a0 = mulhi(a0, -476) - mulhi(mullo(a0, -13284), 1263); /* -751...631 */ + a0 += s[1 * i + 0]; /* -751...886 */ + a0 += (a0 >> 15) & 1263; /* 0...1262 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -22001); + + /* invalid inputs might need reduction mod 1263 */ + a1 -= 1263; + a1 += (a1 >> 15) & 1263; + + R7[2 * i] = a0; + R7[2 * i + 1] = a1; + } + + /* R7 ------> R6: 
reconstruct mod 11*[9097]+[2188] */ + + i = 0; + s -= 2; + a0 = R7[5]; + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4549...5135 */ + a0 += s[2 * i + 1]; /* -4549...5390 */ + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4712...4741 */ + a0 += s[2 * i + 0]; /* -4712...4996 */ + a0 += (a0 >> 15) & 9097; /* 0...9096 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, 17081); + + /* invalid inputs might need reduction mod 2188 */ + a1 -= 2188; + a1 += (a1 >> 15) & 2188; + + R6[10] = a0; + R6[11] = a1; + s -= 10; + for (i = 4; i >= 0; --i) { + a0 = R7[i]; + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4549...5135 */ + a0 += s[2 * i + 1]; /* -4549...5390 */ + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4712...4741 */ + a0 += s[2 * i + 0]; /* -4712...4996 */ + a0 += (a0 >> 15) & 9097; /* 0...9096 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, 17081); + + /* invalid inputs might need reduction mod 9097 */ + a1 -= 9097; + a1 += (a1 >> 15) & 9097; + + R6[2 * i] = a0; + R6[2 * i + 1] = a1; + } + + /* R6 ------> R5: reconstruct mod 23*[1526]+[367] */ + + i = 0; + s -= 1; + a2 = a0 = R6[11]; + a0 = mulhi(a0, 372) - mulhi(mullo(a0, -10994), 1526); /* -763...856 */ + a0 += s[1 * i + 0]; /* -763...1111 */ + a0 += (a0 >> 15) & 1526; /* 0...1525 */ + a1 = (a2 << 7) + ((s[i] - a0) >> 1); + a1 = mullo(a1, -18381); + + /* invalid inputs might need reduction mod 367 */ + a1 -= 367; + a1 += (a1 >> 15) & 367; + + R5[22] = a0; + R5[23] = a1; + s -= 11; + for (i = 10; i >= 0; --i) { + a2 = a0 = R6[i]; + a0 = mulhi(a0, 372) - mulhi(mullo(a0, -10994), 1526); /* -763...856 */ + a0 += s[1 * i + 0]; /* -763...1111 */ + a0 += (a0 >> 15) & 1526; /* 0...1525 */ + a1 = (a2 << 7) + ((s[i] - a0) >> 1); + a1 = mullo(a1, -18381); + + /* invalid inputs might need reduction mod 1526 */ + a1 -= 1526; + a1 += (a1 >> 15) & 1526; + + R5[2 * i] = a0; + R5[2 * i + 1] = a1; + } + + /* R5 ------> R4: reconstruct mod 47*[625]+[150] */ + + i = 0; + s -= 1; + a2 = a0 = R5[23]; + a0 = mulhi(a0, -284) - mulhi(mullo(a0, -26844), 625); /* -384...312 */ + a0 += s[1 * i + 0]; /* -384...567 */ + a0 += (a0 >> 15) & 625; /* 0...624 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, 32401); + + /* invalid inputs might need reduction mod 150 */ + a1 -= 150; + a1 += (a1 >> 15) & 150; + + R4[46] = a0; + R4[47] = a1; + s -= 23; + i = 7; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R5[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -284), mulhiconst(mulloconst(A0, -26844), 625)); /* -384...312 */ + A0 = add(A0, S0); /* -384...567 */ + A0 = ifnegaddconst(A0, 625); /* 0...624 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, 32401); + + /* invalid inputs might need reduction mod 625 */ + A1 = ifgesubconst(A1, 625); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R4 
------> R3: reconstruct mod 95*[6400]+[1531] */ + + i = 0; + s -= 2; + a2 = a0 = R4[47]; + a0 = mulhi(a0, 2816) - mulhi(mullo(a0, -2621), 6400); /* -3200...3904 */ + a0 += s[2 * i + 1]; /* -3200...4159 */ + a0 = mulhi(a0, 2816) - mulhi(mullo(a0, -2621), 6400); /* -3338...3378 */ + a0 += s[2 * i + 0]; /* -3338...3633 */ + a0 += (a0 >> 15) & 6400; /* 0...6399 */ + a1 = (a2 << 8) + s[2 * i + 1] + ((s[2 * i] - a0) >> 8); + a1 = mullo(a1, 23593); + + /* invalid inputs might need reduction mod 1531 */ + a1 -= 1531; + a1 += (a1 >> 15) & 1531; + + R3[94] = a0; + R3[95] = a1; + s -= 94; + i = 31; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R4[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 2816), mulhiconst(mulloconst(A0, -2621), 6400)); /* -3200...3904 */ + A0 = add(A0, S1); /* -3200...4159 */ + A0 = sub(mulhiconst(A0, 2816), mulhiconst(mulloconst(A0, -2621), 6400)); /* -3338...3378 */ + A0 = add(A0, S0); /* -3338...3633 */ + A0 = ifnegaddconst(A0, 6400); /* 0...6399 */ + A1 = add(add(shiftleftconst(A2, 8), S1), signedshiftrightconst(sub(S0, A0), 8)); + A1 = mulloconst(A1, 23593); + + /* invalid inputs might need reduction mod 6400 */ + A1 = ifgesubconst(A1, 6400); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R3 ------> R2: reconstruct mod 190*[1280]+[1531] */ + + R2[190] = R3[95]; + s -= 95; + i = 79; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R3[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 256), mulhiconst(mulloconst(A0, -13107), 1280)); /* -640...704 */ + A0 = add(A0, S0); /* -640...959 */ + A0 = ifnegaddconst(A0, 1280); /* 0...1279 */ + A1 = add(A2, signedshiftrightconst(sub(S0, A0), 8)); + A1 = mulloconst(A1, -13107); + + /* invalid inputs might need reduction mod 1280 */ + A1 = ifgesubconst(A1, 1280); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R2 ------> R1: reconstruct mod 380*[9157]+[1531] */ + + R1[380] = R2[190]; + s -= 380; + i = 174; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R2[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 1592), 
mulhiconst(mulloconst(A0, -1832), 9157)); /* -4579...4976 */ + A0 = add(A0, S1); /* -4579...5231 */ + A0 = sub(mulhiconst(A0, 1592), mulhiconst(mulloconst(A0, -1832), 9157)); /* -4690...4705 */ + A0 = add(A0, S0); /* -4690...4960 */ + A0 = ifnegaddconst(A0, 9157); /* 0...9156 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, 25357); + + /* invalid inputs might need reduction mod 9157 */ + A1 = ifgesubconst(A1, 9157); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R1 ------> R0: reconstruct mod 761*[1531] */ + + R0[760] = 3 * R1[380] - 2295; + s -= 380; + i = 364; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R1[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 518), mulhiconst(mulloconst(A0, -10958), 1531)); /* -766...895 */ + A0 = add(A0, S0); /* -766...1150 */ + A0 = ifnegaddconst(A0, 1531); /* 0...1530 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, 15667); + + /* invalid inputs might need reduction mod 1531 */ + A1 = ifgesubconst(A1, 1531); + + A0 = mulloconst(A0, 3); + A1 = mulloconst(A1, 3); + A0 = subconst(A0, 2295); + A1 = subconst(A1, 2295); + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_decode_761x1531.h b/crypto_kem/ntrulpr761/avx2/crypto_decode_761x1531.h new file mode 100644 index 00000000..a862cd5a --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761X1531_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761X1531_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531_STRBYTES 1007 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531_ITEMS 761 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531_ITEMBYTES 2 + +void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_decode_761x3.c b/crypto_kem/ntrulpr761/avx2/crypto_decode_761x3.c new file mode 100644 index 00000000..5c7cdcd9 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_761x3.c @@ -0,0 +1,65 @@ +#include "crypto_decode_761x3.h" +#include +#define uint8 uint8_t + +#define p 761 +#define loops 6 +#define overshoot 2 + 
+void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x3(void *v, const unsigned char *s) { + uint8 *f = v; + int loop; + uint8 *nextf = f + 128 - 4 * overshoot; + const unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i s0 = _mm256_loadu_si256((const __m256i *) s); + s = nexts; + nexts += 32; + + __m256i s1 = _mm256_srli_epi16(s0 & _mm256_set1_epi8(-16), 4); + s0 &= _mm256_set1_epi8(15); + + __m256i a0 = _mm256_unpacklo_epi8(s0, s1); + /* 0 0>>4 1 1>>4 2 2>>4 3 3>>4 4 4>>4 5 5>>4 6 6>>4 7 7>>4 */ + /* 16 16>>4 ... */ + __m256i a1 = _mm256_unpackhi_epi8(s0, s1); + /* 8 8>>4 9 9>>4 10 10>>4 ... */ + /* 24 24>>4 ... */ + + __m256i a2 = _mm256_srli_epi16(a0 & _mm256_set1_epi8(12), 2); + __m256i a3 = _mm256_srli_epi16(a1 & _mm256_set1_epi8(12), 2); + a0 &= _mm256_set1_epi8(3); + a1 &= _mm256_set1_epi8(3); + + __m256i b0 = _mm256_unpacklo_epi8(a0, a2); + /* 0 0>>2 0>>4 0>>6 1 1>>2 1>>4 1>>6 */ + /* 2 2>>2 2>>4 2>>6 3 3>>2 3>>4 3>.6 */ + /* 16 16>>2 16>>4 16>>6 ... */ + __m256i b2 = _mm256_unpackhi_epi8(a0, a2); + /* 4 4>>2 ... */ + __m256i b1 = _mm256_unpacklo_epi8(a1, a3); + /* 8 8>>2 ... */ + __m256i b3 = _mm256_unpackhi_epi8(a1, a3); + /* 12 12>>2 ... */ + + __m256i f0 = _mm256_permute2x128_si256(b0, b2, 0x20); + __m256i f2 = _mm256_permute2x128_si256(b0, b2, 0x31); + __m256i f1 = _mm256_permute2x128_si256(b1, b3, 0x20); + __m256i f3 = _mm256_permute2x128_si256(b1, b3, 0x31); + + f0 = _mm256_add_epi8(f0, _mm256_set1_epi8(-1)); + f1 = _mm256_add_epi8(f1, _mm256_set1_epi8(-1)); + f2 = _mm256_add_epi8(f2, _mm256_set1_epi8(-1)); + f3 = _mm256_add_epi8(f3, _mm256_set1_epi8(-1)); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + f = nextf; + nextf += 128; + } + + *f = ((uint8)(*s & 3)) - 1; +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_decode_761x3.h b/crypto_kem/ntrulpr761/avx2/crypto_decode_761x3.h new file mode 100644 index 00000000..ce7e0bfc --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761X3_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761X3_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x3_STRBYTES 191 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x3_ITEMS 761 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x3_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x3(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint16.c b/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint16.c new file mode 100644 index 00000000..f72588a9 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint16.c @@ -0,0 +1,16 @@ +#include "crypto_decode_761xint16.h" + + +void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint16.h b/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint16.h new file mode 100644 index 00000000..b6c48d9c --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761XINT16_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761XINT16_H + +#include +#define 
PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16_STRBYTES 1522 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16_ITEMS 761 + +void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint32.c b/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint32.c new file mode 100644 index 00000000..2ac1fe5e --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_761xint32.h" + + +void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint32.h b/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint32.h new file mode 100644 index 00000000..213d3111 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_decode_761xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761XINT32_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761XINT32_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint32_STRBYTES 3044 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint32_ITEMBYTES 4 +#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint32_ITEMS 761 + +void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint32(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_256x16.c b/crypto_kem/ntrulpr761/avx2/crypto_encode_256x16.c new file mode 100644 index 00000000..3cdf2a1d --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_256x16.c @@ -0,0 +1,10 @@ +#include "crypto_encode_256x16.h" + + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x16(unsigned char *s, const void *v) { + const unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + s[i] = T[2 * i] + (T[2 * i + 1] << 4); + } +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_256x16.h b/crypto_kem/ntrulpr761/avx2/crypto_encode_256x16.h new file mode 100644 index 00000000..f4763596 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_256X16_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_256X16_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_256x2.c b/crypto_kem/ntrulpr761/avx2/crypto_encode_256x2.c new file mode 100644 index 00000000..5bafef06 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_256x2.c @@ -0,0 +1,88 @@ +#include "crypto_encode_256x2.h" +#include +#include + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x2(unsigned char *s, const void *v) { + __m256i a0 = _mm256_loadu_si256(0 + (__m256i *) v); + __m256i a1 = _mm256_loadu_si256(1 + (__m256i *) v); + __m256i a2 = _mm256_loadu_si256(2 + (__m256i *) v); + __m256i a3 = _mm256_loadu_si256(3 + (__m256i *) v); + __m256i a4 = _mm256_loadu_si256(4 + (__m256i *) v); + __m256i a5 = _mm256_loadu_si256(5 + (__m256i *) v); + __m256i a6 = _mm256_loadu_si256(6 + (__m256i *) v); + __m256i a7 = 
_mm256_loadu_si256(7 + (__m256i *) v); + __m256i bottom = _mm256_set1_epi8(1); + __m256i zero = _mm256_setzero_si256(); + __m256i b0 = _mm256_cmpgt_epi8(a0 & bottom, zero); + __m256i b1 = _mm256_cmpgt_epi8(a1 & bottom, zero); + __m256i b2 = _mm256_cmpgt_epi8(a2 & bottom, zero); + __m256i b3 = _mm256_cmpgt_epi8(a3 & bottom, zero); + __m256i b4 = _mm256_cmpgt_epi8(a4 & bottom, zero); + __m256i b5 = _mm256_cmpgt_epi8(a5 & bottom, zero); + __m256i b6 = _mm256_cmpgt_epi8(a6 & bottom, zero); + __m256i b7 = _mm256_cmpgt_epi8(a7 & bottom, zero); + int32_t c0 = _mm256_movemask_epi8(b0); + int32_t c1 = _mm256_movemask_epi8(b1); + int32_t c2 = _mm256_movemask_epi8(b2); + int32_t c3 = _mm256_movemask_epi8(b3); + int32_t c4 = _mm256_movemask_epi8(b4); + int32_t c5 = _mm256_movemask_epi8(b5); + int32_t c6 = _mm256_movemask_epi8(b6); + int32_t c7 = _mm256_movemask_epi8(b7); + *s++ = c0; + c0 >>= 8; + *s++ = c0; + c0 >>= 8; + *s++ = c0; + c0 >>= 8; + *s++ = c0; + *s++ = c1; + c1 >>= 8; + *s++ = c1; + c1 >>= 8; + *s++ = c1; + c1 >>= 8; + *s++ = c1; + *s++ = c2; + c2 >>= 8; + *s++ = c2; + c2 >>= 8; + *s++ = c2; + c2 >>= 8; + *s++ = c2; + *s++ = c3; + c3 >>= 8; + *s++ = c3; + c3 >>= 8; + *s++ = c3; + c3 >>= 8; + *s++ = c3; + *s++ = c4; + c4 >>= 8; + *s++ = c4; + c4 >>= 8; + *s++ = c4; + c4 >>= 8; + *s++ = c4; + *s++ = c5; + c5 >>= 8; + *s++ = c5; + c5 >>= 8; + *s++ = c5; + c5 >>= 8; + *s++ = c5; + *s++ = c6; + c6 >>= 8; + *s++ = c6; + c6 >>= 8; + *s++ = c6; + c6 >>= 8; + *s++ = c6; + *s++ = c7; + c7 >>= 8; + *s++ = c7; + c7 >>= 8; + *s++ = c7; + c7 >>= 8; + *s++ = c7; +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_256x2.h b/crypto_kem/ntrulpr761/avx2/crypto_encode_256x2.h new file mode 100644 index 00000000..e29ac8f7 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_256X2_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_256X2_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x2(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531.c b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531.c new file mode 100644 index 00000000..8726b5ba --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531.c @@ -0,0 +1,301 @@ +#include "crypto_encode_761x1531.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x1531(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 48; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2295)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1531)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 
10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[380] = (((R0[760] + 2295) & 16383) * 10923) >> 15; + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9157)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9157)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[190] = R[380]; + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 2; + writing -= 1; + out -= 1; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1280)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[95] = R[190]; + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6400)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6400)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 
0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(625)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1526)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + for (i = 0; i < 6; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9097; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1263; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)6232; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)593; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531.h b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531.h new file mode 100644 index 00000000..58bb9ea2 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_761X1531_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_761X1531_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x1531_STRBYTES 1007 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x1531_ITEMS 761 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x1531_ITEMBYTES 2 + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x1531(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531round.c b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531round.c new file mode 
100644 index 00000000..e1f96889 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531round.c @@ -0,0 +1,303 @@ +#include "crypto_encode_761x1531round.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x1531round(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 48; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); + x = _mm256_add_epi16(x, _mm256_add_epi16(x, x)); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2295)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1531)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[380] = (((3 * ((10923 * R0[760] + 16384) >> 15) + 2295) & 16383) * 10923) >> 15; + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9157)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9157)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[190] = R[380]; + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 2; + writing -= 1; + out -= 1; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1280)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 
9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[95] = R[190]; + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6400)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6400)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(625)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1526)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + for (i = 0; i < 6; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9097; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = 
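+ /* The remaining scalar passes follow the same pattern as the vector loops
+    above, assuming the standard NTRU Prime batch encoder: combine adjacent
+    digits as r0 + M*r1, emit the low byte(s), and carry the quotient into the
+    next pass with radix M' = ceil(M^2/256) after one emitted byte or
+    M' = ceil(M^2/65536) after two.  That rule reproduces the constants used
+    here: 1531 -> 9157 -> 1280 -> 6400 -> 625 -> 1526 -> 9097 -> 1263 ->
+    6232 -> 593. */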
r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1263; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)6232; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)593; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531round.h b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531round.h new file mode 100644 index 00000000..b9335dad --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x1531round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_761X1531ROUND_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_761X1531ROUND_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x1531round_STRBYTES 1007 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x1531round_ITEMS 761 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x1531round_ITEMBYTES 2 + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x1531round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_761x3.c b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x3.c new file mode 100644 index 00000000..9cb10c88 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x3.c @@ -0,0 +1,64 @@ +#include "crypto_encode_761x3.h" +#include +#define uint8 uint8_t + +#define p 761 +#define loops 6 +#define overshoot 2 + +static const union { + uint8 init[32]; + __m256i val; +} lobytes_buf = { .init = { + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + } +}; +#define lobytes (lobytes_buf.val) + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x3(unsigned char *s, const void *v) { + const uint8 *f = v; + int loop; + const uint8 *nextf = f + 128 - 4 * overshoot; + unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i f0 = _mm256_loadu_si256((const __m256i *) (f + 0)); + __m256i f1 = _mm256_loadu_si256((const __m256i *) (f + 32)); + __m256i f2 = _mm256_loadu_si256((const __m256i *) (f + 64)); + __m256i f3 = _mm256_loadu_si256((const __m256i *) (f + 96)); + f = nextf; + nextf += 128; + + __m256i a0 = _mm256_packus_epi16(f0 & lobytes, f1 & lobytes); + /* 0 2 4 6 8 10 12 14 32 34 36 38 40 42 44 46 */ + /* 16 18 20 22 24 26 28 30 48 50 52 54 56 58 60 62 */ + __m256i a1 = _mm256_packus_epi16(_mm256_srli_epi16(f0, 8), _mm256_srli_epi16(f1, 8)); + /* 1 3 ... */ + __m256i a2 = _mm256_packus_epi16(f2 & lobytes, f3 & lobytes); + __m256i a3 = _mm256_packus_epi16(_mm256_srli_epi16(f2, 8), _mm256_srli_epi16(f3, 8)); + + a0 = _mm256_add_epi8(a0, _mm256_slli_epi16(a1 & _mm256_set1_epi8(63), 2)); + a2 = _mm256_add_epi8(a2, _mm256_slli_epi16(a3 & _mm256_set1_epi8(63), 2)); + + __m256i b0 = _mm256_packus_epi16(a0 & lobytes, a2 & lobytes); + /* 0 4 8 12 32 36 40 44 64 68 72 76 96 100 104 108 */ + /* 16 20 24 28 48 52 56 60 80 84 88 92 112 116 120 124 */ + __m256i b2 = _mm256_packus_epi16(_mm256_srli_epi16(a0, 8), _mm256_srli_epi16(a2, 8)); + /* 2 6 ... 
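+    In effect each output byte packs four consecutive coefficients c0..c3 in
+    {-1, 0, 1} as (c0+1) + 4*(c1+1) + 16*(c2+1) + 64*(c3+1); the +85 added
+    below supplies the four +1 offsets, since 85 = 1 + 4 + 16 + 64.  For
+    example (1, 0, -1, 1) encodes to 2 + 4 + 0 + 128 = 134.  The trailing
+    *s++ = *f++ + 1 writes coefficient 760 on its own, for 190 + 1 = 191
+    = STRBYTES bytes in total.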
*/ + + b0 = _mm256_add_epi8(b0, _mm256_slli_epi16(b2 & _mm256_set1_epi8(15), 4)); + + b0 = _mm256_permutevar8x32_epi32(b0, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + + b0 = _mm256_add_epi8(b0, _mm256_set1_epi8(85)); + + _mm256_storeu_si256((__m256i *) s, b0); + s = nexts; + nexts += 32; + } + + *s++ = *f++ + 1; +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_761x3.h b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x3.h new file mode 100644 index 00000000..da478dbb --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_761X3_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_761X3_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x3_STRBYTES 191 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x3_ITEMS 761 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x3_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_761xint16.c b/crypto_kem/ntrulpr761/avx2/crypto_encode_761xint16.c new file mode 100644 index 00000000..cc86a098 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_761xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_761xint16.h" + + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_761xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_encode_761xint16.h b/crypto_kem/ntrulpr761/avx2/crypto_encode_761xint16.h new file mode 100644 index 00000000..0ae1dafe --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_encode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_761XINT16_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_ENCODE_761XINT16_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761xint16_STRBYTES 1522 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR761_AVX2_crypto_encode_761xint16_ITEMS 761 + +void PQCLEAN_NTRULPR761_AVX2_crypto_encode_761xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_sort_int32.c b/crypto_kem/ntrulpr761/avx2/crypto_sort_int32.c new file mode 100644 index 00000000..ac82d1a4 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_sort_int32.c @@ -0,0 +1,1210 @@ +#include "crypto_sort_int32.h" +#include +// Based on supercop-20200820/crypto_sort/int32/avx2 + + +#define int32 int32_t + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min((a),(b)); \ + (b) = int32x8_max((a),(b)); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = (int32)((int64_t) * b - (int64_t) * a); + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if ((long long) n < 8) { + while ((long long) n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = 
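+ /* int32_MINMAX above appears to be a branch-free compare-and-swap: c starts
+    as b - a computed in 64 bits and truncated, the line c ^= ab & (c ^ b)
+    repairs the sign when a and b have opposite signs (so the 32-bit
+    difference could overflow), c >>= 31 turns that sign into an all-ones or
+    all-zero mask, and c &= ab makes the final xors either swap the two
+    values or leave them unchanged.  For example a = 5, b = 3: ab = 6, the
+    mask is all ones, c = 6, and the xors yield a = 3, b = 5. */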
int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, 
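+ /* a fixed 19-comparator network on 8 elements; since every compare-exchange
+    goes through the branch-free int32_MINMAX, the sequence of memory accesses
+    does not depend on the data being sorted */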
&x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = 
int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 
x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = 
int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = 
_mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + 
int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_NTRULPR761_AVX2_crypto_sort_int32(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + 
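+ /* For non-power-of-two n, q is now the largest power of two below n (and at
+    least n/2).  For the ppadsort = 768 call made from kem.c this gives
+    q = 512: the first 512 entries are sorted with flagdown set (the opposite
+    direction, so the two runs can be merged), the remaining 256 are sorted
+    recursively, and the runs are merged below.  Small inputs (n <= 256,
+    handled next) are instead copied into a power-of-two buffer padded with
+    0x7fffffff sentinels. */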
if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *) y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_NTRULPR761_AVX2_crypto_sort_int32(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = 
_mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + 
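+ /* q == 8 endgame: each 16-element window is merged entirely inside two
+    registers via the permute/unpack ladder below; whatever tail remains when
+    n is not a multiple of 16 falls through to minmax_vector and the scalar
+    compare-exchanges at the end of the function */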
int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_sort_int32.h b/crypto_kem/ntrulpr761/avx2/crypto_sort_int32.h new file mode 100644 index 00000000..73931416 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_SORT +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_SORT + +#include +#include + + +void PQCLEAN_NTRULPR761_AVX2_crypto_sort_int32(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_sort_uint32.c b/crypto_kem/ntrulpr761/avx2/crypto_sort_uint32.c new file mode 100644 index 00000000..dbbd0d2c --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_NTRULPR761_AVX2_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_NTRULPR761_AVX2_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_sort_uint32.h b/crypto_kem/ntrulpr761/avx2/crypto_sort_uint32.h new file mode 100644 index 00000000..66f79cb1 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_sort_uint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_SORT_UINT32_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_SORT_UINT32_H + +#include +#include + + +void PQCLEAN_NTRULPR761_AVX2_crypto_sort_uint32(uint32_t *array, size_t n); + +#endif diff --git 
a/crypto_kem/ntrulpr761/avx2/crypto_stream_aes256ctr.c b/crypto_kem/ntrulpr761/avx2/crypto_stream_aes256ctr.c new file mode 100644 index 00000000..a40c96d4 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_stream_aes256ctr.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" + + +int PQCLEAN_NTRULPR761_AVX2_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); + return 0; +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_stream_aes256ctr.h b/crypto_kem/ntrulpr761/avx2/crypto_stream_aes256ctr.h new file mode 100644 index 00000000..52a8e430 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_stream_aes256ctr.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_STREAM_AES256CTR_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_STREAM_AES256CTR_H +#include "aes.h" +#include +#include + + + +int PQCLEAN_NTRULPR761_AVX2_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]); + +#endif diff --git a/crypto_kem/ntrulpr761/avx2/crypto_verify_1167.c b/crypto_kem/ntrulpr761/avx2/crypto_verify_1167.c new file mode 100644 index 00000000..17c1da57 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_verify_1167.c @@ -0,0 +1,36 @@ +#include "crypto_verify_1167.h" +#include + +int PQCLEAN_NTRULPR761_AVX2_crypto_verify_1167(const unsigned char *x, const unsigned char *y) { + __m256i diff = _mm256_set1_epi8(0); + unsigned int differentbits = 0; + int i = PQCLEAN_NTRULPR761_AVX2_crypto_verify_1167_BYTES; + + i -= 32; + for (;;) { + do { + __m256i x0 = _mm256_loadu_si256((__m256i *) x); + __m256i y0 = _mm256_loadu_si256((__m256i *) y); + diff |= x0 ^ y0; + i -= 32; + x += 32; + y += 32; + } while (i >= 0); + if (i <= -32) { + break; + } + x += i; + y += i; + } + + diff |= _mm256_srli_epi16(diff, 8); + diff |= _mm256_srli_epi32(diff, 16); + diff |= _mm256_srli_epi64(diff, 32); + + differentbits = _mm256_extract_epi8(diff, 0); + differentbits |= _mm256_extract_epi8(diff, 8); + differentbits |= _mm256_extract_epi8(diff, 16); + differentbits |= _mm256_extract_epi8(diff, 24); + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/crypto_kem/ntrulpr761/avx2/crypto_verify_1167.h b/crypto_kem/ntrulpr761/avx2/crypto_verify_1167.h new file mode 100644 index 00000000..afeaa888 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/crypto_verify_1167.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_VERIFY_1167_H +#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_VERIFY_1167_H + +#include +#define PQCLEAN_NTRULPR761_AVX2_crypto_verify_1167_BYTES 1167 + +int PQCLEAN_NTRULPR761_AVX2_crypto_verify_1167(const unsigned char *x, const unsigned char *y); +#endif diff --git a/crypto_kem/ntrulpr761/avx2/kem.c b/crypto_kem/ntrulpr761/avx2/kem.c new file mode 100644 index 00000000..2c5dbbf6 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/kem.c @@ -0,0 +1,287 @@ +#include "api.h" +#include "crypto_sort_uint32.h" +#include "crypto_stream_aes256ctr.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* ----- masks */ + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u >>= 15; + return -(int) u; + /* alternative 
with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +#define q12 ((q-1)/2) +typedef int16 Fq; + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +/* assumes twos complement; use, e.g., gcc -fwrapv */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +/* works for all uint32 x */ +static Fq Fq_bigfreeze(uint32 x) { + x -= q * ((x * (uint64)q31) >> 31); + x -= q * ((x * (uint64)q31) >> 31); + x -= q; + x += (-(x >> 31)) & (uint32)q; + return x; +} + +/* ----- Top and Right */ + +static int8 Top(Fq C) { + return (tau1 * (int32)(C + tau0) + 16384) >> 15; +} + +static Fq Right(int8 T) { + return Fq_freeze(tau3 * (int32)T - tau2); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* ----- sorting to generate short polynomial */ + +static void Short_fromlist(small *out, const uint32 *in) { + uint32 L[ppadsort]; + int i; + + for (i = 0; i < w; ++i) { + L[i] = in[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (in[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_NTRULPR761_AVX2_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[p]; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(out, L); +} + +/* ----- Inputs, Generator */ + +typedef int8 Inputs[I]; /* passed by reference */ + +static const unsigned char aes_nonce[16] = {0}; + +/* G = Generator(pk) */ +static void Generator(Fq *G, const unsigned char *pk) { + uint32 L[p]; + int i; + + PQCLEAN_NTRULPR761_AVX2_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, pk); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + G[i] = Fq_bigfreeze(L[i]) - q12; + } +} + +/* ----- NTRU LPRime */ + +#define Seeds_bytes 32 +#define Ciphertexts_bytes (Rounded_bytes+Top_bytes) +#define SecretKeys_bytes Small_bytes +#define PublicKeys_bytes (Seeds_bytes+Rounded_bytes) +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +static void Hide(unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + small b[p]; + int i; + + Inputs_encode(r_enc + 1, r); + { + unsigned char h[Hash_bytes]; + uint32 L[p]; + { + unsigned char s[1 + Inputs_bytes]; + Inputs_encode(s + 1, r); + s[0] = 5; + Hash(h, s, sizeof s); + } + PQCLEAN_NTRULPR761_AVX2_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, h); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(b, L); + } + { + Fq bG[p]; + Generator(bG, pk); + Rq_mult_small(bG, b); + Round_and_encode(c, bG); + c += Rounded_bytes; + } + { + Fq bA[p]; + int8 T[I]; + 
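+ /* Second ciphertext component (a summary of the block below): decode the
+    rounded vector from pk + Seeds_bytes into bA, multiply by b, and for each
+    of the I = 256 positions keep only the 4-bit value
+    Top(bA[i] + r[i]*(q-1)/2).  With the ntrulpr761 constants from params.h
+    (tau0 = 2156, tau1 = 114), Top(0) = (114*2156 + 16384) >> 15 = 8, the
+    midpoint of the 4-bit range; Right() on the decapsulation side roughly
+    inverts this map. */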
Rounded_decode(bA, pk + Seeds_bytes); + Rq_mult_small(bA, b); + for (i = 0; i < I; ++i) { + T[i] = Top(Fq_freeze(bA[i] + r[i] * q12)); + } + Top_encode(c, T); + c += Top_bytes; + } + { + unsigned char x[1 + Inputs_bytes + Hash_bytes]; + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] = r_enc[1 + i]; + } + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Inputs_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c, x, sizeof x); + } +} + + +int PQCLEAN_NTRULPR761_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + Fq aG[p]; + int i; + randombytes(pk, Seeds_bytes); + Generator(aG, pk); + { + small a[p]; + Short_random(a); + Rq_mult_small(aG, a); + Small_encode(sk, a); + } + Round_and_encode(pk + Seeds_bytes, aG); + { + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Inputs_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Inputs_bytes); + } + return 0; +} + +int PQCLEAN_NTRULPR761_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + int i; + unsigned char cache[Hash_bytes]; + { + unsigned char y[1 + PublicKeys_bytes]; + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + Inputs r; + { + unsigned char s[Inputs_bytes]; + randombytes(s, sizeof s); + Inputs_decode(r, s); + } + { + unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(c, x, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + return 0; +} + +int PQCLEAN_NTRULPR761_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Inputs_bytes; + Inputs r; + int i; + { + Fq aB[p]; + Rounded_decode(aB, c); + { + small a[p]; + Small_decode(a, sk); + Rq_mult_small(aB, a); + } + { + int8 T[I]; + Top_decode(T, c + Rounded_bytes); + for (i = 0; i < I; ++i) { + r[i] = -int16_negative_mask(Fq_freeze(Right(T[i]) - aB[i] + 4 * w + 1)); + } + } + } + { + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + int mask; + unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(cnew, x, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] ^= mask & (x[1 + i] ^ rho[i]); + } + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] = 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/ntrulpr761/avx2/params.h b/crypto_kem/ntrulpr761/avx2/params.h new file mode 100644 index 00000000..7ca23546 --- /dev/null +++ b/crypto_kem/ntrulpr761/avx2/params.h @@ -0,0 +1,61 @@ +#ifndef params_H +#define params_H +#include "crypto_core_multsntrup761.h" +#include "crypto_decode_256x16.h" +#include "crypto_decode_256x2.h" +#include "crypto_decode_761x1531.h" +#include "crypto_decode_761x3.h" +#include "crypto_decode_761xint16.h" +#include "crypto_decode_761xint32.h" +#include "crypto_encode_256x16.h" +#include "crypto_encode_256x2.h" +#include "crypto_encode_761x1531.h" +#include "crypto_encode_761x1531round.h" +#include "crypto_encode_761x3.h" +#include "crypto_encode_761xint16.h" 
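+/* The q18/q27/q31 constants defined below feed the Barrett-style
+   (approximate-quotient) reductions Fq_freeze and Fq_bigfreeze in kem.c.
+   Quick check against q = 4591:
+     round(2^18/4591) = round(262144/4591)     = 57
+     round(2^27/4591) = round(134217728/4591)  = 29235
+     floor(2^31/4591) = floor(2147483648/4591) = 467759 */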
+#include "crypto_verify_1167.h" + + +#define p 761 +#define q 4591 +#define w 250 +#define tau0 2156 +#define tau1 114 +#define tau2 2007 +#define tau3 287 +#define I 256 + +#define ppadsort 768 + +#define q18 57 /* round(2^18/q) */ +#define q27 29235 /* round(2^27/q) */ +#define q31 467759 /* floor(2^31/q) */ + +#define crypto_verify_clen PQCLEAN_NTRULPR761_AVX2_crypto_verify_1167 + +#define Rounded_bytes PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531_STRBYTES +#define Rounded_decode PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531 + +#define Round_and_encode PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x1531round + +#define Small_bytes PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x3_STRBYTES +#define Small_encode PQCLEAN_NTRULPR761_AVX2_crypto_encode_761x3 +#define Small_decode PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x3 + +#define Top_bytes PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x16_STRBYTES +#define Top_encode PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x16 +#define Top_decode PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x16 + +#define Inputs_bytes PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x2_STRBYTES +#define Inputs_encode PQCLEAN_NTRULPR761_AVX2_crypto_encode_256x2 +#define Inputs_decode PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x2 + +#define crypto_decode_pxint32 PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint32 + +#define crypto_decode_pxint16 PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16 + +#define crypto_encode_pxint16 PQCLEAN_NTRULPR761_AVX2_crypto_encode_761xint16 + +#define crypto_core_mult PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761 + +#endif diff --git a/crypto_kem/ntrulpr761/clean/LICENSE b/crypto_kem/ntrulpr761/clean/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/ntrulpr761/clean/Makefile b/crypto_kem/ntrulpr761/clean/Makefile new file mode 100644 index 00000000..b1efe00b --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/Makefile @@ -0,0 +1,19 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libntrulpr761_clean.a +HEADERS=api.h crypto_core_multsntrup761.h crypto_decode_256x16.h crypto_decode_256x2.h crypto_decode_761x1531.h crypto_decode_761x3.h crypto_decode_761xint16.h crypto_decode_761xint32.h crypto_encode_256x16.h crypto_encode_256x2.h crypto_encode_761x1531.h crypto_encode_761x1531round.h crypto_encode_761x3.h crypto_encode_761xint16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_1167.h params.h +OBJECTS=crypto_core_multsntrup761.o crypto_decode_256x16.o crypto_decode_256x2.o crypto_decode_761x1531.o crypto_decode_761x3.o crypto_decode_761xint16.o crypto_decode_761xint32.o crypto_encode_256x16.o crypto_encode_256x2.o crypto_encode_761x1531.o crypto_encode_761x1531round.o crypto_encode_761x3.o crypto_encode_761xint16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1167.o kem.o + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/ntrulpr761/clean/Makefile.Microsoft_nmake b/crypto_kem/ntrulpr761/clean/Makefile.Microsoft_nmake new file mode 100644 index 00000000..f1cf84ca --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/Makefile.Microsoft_nmake @@ -0,0 +1,19 @@ +# This Makefile can be used with Microsoft Visual Studio's nmake 
using the command: +# nmake /f Makefile.Microsoft_nmake + +LIBRARY=libntrulpr761_clean.lib +OBJECTS=crypto_core_multsntrup761.obj crypto_decode_256x16.obj crypto_decode_256x2.obj crypto_decode_761x1531.obj crypto_decode_761x3.obj crypto_decode_761xint16.obj crypto_decode_761xint32.obj crypto_encode_256x16.obj crypto_encode_256x2.obj crypto_encode_761x1531.obj crypto_encode_761x1531round.obj crypto_encode_761x3.obj crypto_encode_761xint16.obj crypto_sort_int32.obj crypto_sort_uint32.obj crypto_stream_aes256ctr.obj crypto_verify_1167.obj kem.obj + +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX + +all: $(LIBRARY) + +# Make sure objects are recompiled if headers change. +$(OBJECTS): *.h + +$(LIBRARY): $(OBJECTS) + LIB.EXE /NOLOGO /WX /OUT:$@ $** + +clean: + -DEL $(OBJECTS) + -DEL $(LIBRARY) diff --git a/crypto_kem/ntrulpr761/clean/api.h b/crypto_kem/ntrulpr761/clean/api.h new file mode 100644 index 00000000..dc7f2bb6 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/api.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_API_H +#define PQCLEAN_NTRULPR761_CLEAN_API_H + + + +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ALGNAME "ntrulpr761" + +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_SECRETKEYBYTES 1294 +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_PUBLICKEYBYTES 1039 +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_CIPHERTEXTBYTES 1167 +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_BYTES 32 + +int PQCLEAN_NTRULPR761_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int PQCLEAN_NTRULPR761_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); +int PQCLEAN_NTRULPR761_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_core_multsntrup761.c b/crypto_kem/ntrulpr761/clean/crypto_core_multsntrup761.c new file mode 100644 index 00000000..6fadd821 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_core_multsntrup761.c @@ -0,0 +1,60 @@ +#include "crypto_core_multsntrup761.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +int PQCLEAN_NTRULPR761_CLEAN_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + Fq f[p]; + small g[p]; + Fq fg[p + p - 1]; + int32 result; + int i, j; + + crypto_decode_pxint16(f, inbytes); + for (i = 0; i < p; ++i) { + f[i] = Fq_freeze(f[i]); + } + + for (i = 0; i < p; ++i) { + small gi = kbytes[i]; + small gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + for (i = 0; i < p; ++i) { + result = 0; + for (j = 0; j <= i; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + for (i = p; i < p + p - 1; ++i) { + result = 0; + for (j = i - p + 1; j < p; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + + for (i = p + p - 2; i >= p; --i) { + fg[i - p] = Fq_freeze(fg[i - p] + fg[i]); + fg[i - p + 1] = Fq_freeze(fg[i - p + 1] + fg[i]); + } + + crypto_encode_pxint16(outbytes, fg); + return 0; +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_core_multsntrup761.h b/crypto_kem/ntrulpr761/clean/crypto_core_multsntrup761.h new file mode 100644 index 00000000..3da5b2e0 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_core_multsntrup761.h @@ 
-0,0 +1,11 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_CORE_MULTSNTRUP761_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_CORE_MULTSNTRUP761_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_core_multsntrup761_OUTPUTBYTES 1522 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_core_multsntrup761_INPUTBYTES 1522 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_core_multsntrup761_KEYBYTES 761 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_core_multsntrup761_CONSTBYTES 0 + +int PQCLEAN_NTRULPR761_CLEAN_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_256x16.c b/crypto_kem/ntrulpr761/clean/crypto_decode_256x16.c new file mode 100644 index 00000000..82ff1b27 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_256x16.c @@ -0,0 +1,11 @@ +#include "crypto_decode_256x16.h" + + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x16(void *v, const unsigned char *s) { + unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + T[2 * i] = s[i] & 15; + T[2 * i + 1] = s[i] >> 4; + } +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_256x16.h b/crypto_kem/ntrulpr761/clean/crypto_decode_256x16.h new file mode 100644 index 00000000..2c38ad47 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_256X16_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_256X16_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_256x2.c b/crypto_kem/ntrulpr761/clean/crypto_decode_256x2.c new file mode 100644 index 00000000..d30c5f84 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_256x2.c @@ -0,0 +1,10 @@ +#include "crypto_decode_256x2.h" + + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x2(void *v, const unsigned char *s) { + unsigned char *r = v; + int i; + for (i = 0; i < 256; ++i) { + r[i] = 1 & (s[i >> 3] >> (i & 7)); + } +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_256x2.h b/crypto_kem/ntrulpr761/clean/crypto_decode_256x2.h new file mode 100644 index 00000000..7798a97e --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_256X2_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_256X2_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x2(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_761x1531.c b/crypto_kem/ntrulpr761/clean/crypto_decode_761x1531.c new file mode 100644 index 00000000..fe2c1adb --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_761x1531.c @@ -0,0 +1,211 @@ +#include "crypto_decode_761x1531.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* +CPU division instruction typically takes time depending on x. +This software is designed to take time independent of x. +Time still varies depending on m; user must ensure that m is constant. 
+Time also varies on CPUs where multiplication is variable-time. +There could be more CPU issues. +There could also be compiler issues. +*/ + +static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { + uint32 v = 0x80000000; + uint32 qpart; + uint32 mask; + + v /= m; + + /* caller guarantees m > 0 */ + /* caller guarantees m < 16384 */ + /* vm <= 2^31 <= vm+m-1 */ + /* xvm <= 2^31 x <= xvm+x(m-1) */ + + *q = 0; + + qpart = (x * (uint64)v) >> 31; + /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ + /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ + /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ + /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ + + x -= qpart * m; + *q += qpart; + /* x <= 49146 */ + + qpart = (x * (uint64)v) >> 31; + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ + /* 0 <= newx <= m + 0.4 */ + /* 0 <= newx <= m */ + + x -= qpart * m; + *q += qpart; + /* x <= m */ + + x -= m; + *q += 1; + mask = -(x >> 31); + x += mask & (uint32)m; + *q += mask; + /* x < m */ + + *r = x; +} + +static uint16 uint32_mod_uint14(uint32 x, uint16 m) { + uint32 q; + uint16 r; + uint32_divmod_uint14(&q, &r, x, m); + return r; +} + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x1531(void *v, const unsigned char *s) { + int16 *R0 = v; + uint16 R1[381], R2[191], R3[96], R4[48], R5[24], R6[12], R7[6], R8[3], R9[2], R10[1]; + long long i; + uint16 r0; + uint32 r1, r2; + + s += PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x1531_STRBYTES; + r1 = 0; + r1 = (r1 << 8) | *--s; + r1 = (r1 << 8) | *--s; + r1 = uint32_mod_uint14(r1, 3475); /* needed only for invalid inputs */ + R10[0] = r1; + + r2 = R10[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 593); + R9[0] = r0; + r1 = uint32_mod_uint14(r1, 1500); /* needed only for invalid inputs */ + R9[1] = r1; + + R8[2] = R9[1]; + r2 = R9[0]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6232); + R8[0] = r0; + r1 = uint32_mod_uint14(r1, 6232); /* needed only for invalid inputs */ + R8[1] = r1; + + r2 = R8[2]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1263); + R7[4] = r0; + r1 = uint32_mod_uint14(r1, 304); /* needed only for invalid inputs */ + R7[5] = r1; + for (i = 1; i >= 0; --i) { + r2 = R8[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1263); + R7[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1263); /* needed only for invalid inputs */ + R7[2 * i + 1] = r1; + } + + r2 = R7[5]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9097); + R6[10] = r0; + r1 = uint32_mod_uint14(r1, 2188); /* needed only for invalid inputs */ + R6[11] = r1; + for (i = 4; i >= 0; --i) { + r2 = R7[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9097); + R6[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 9097); /* needed only for invalid inputs */ + R6[2 * i + 1] = r1; + } + + r2 = R6[11]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1526); + R5[22] = r0; + r1 = uint32_mod_uint14(r1, 367); /* needed only for invalid inputs */ + R5[23] = r1; + for (i = 10; i >= 0; --i) { + r2 = R6[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1526); + R5[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1526); /* needed only for invalid inputs */ + R5[2 * i + 1] = r1; + } + + r2 = R5[23]; + r2 = (r2 << 8) | *--s; + 
uint32_divmod_uint14(&r1, &r0, r2, 625); + R4[46] = r0; + r1 = uint32_mod_uint14(r1, 150); /* needed only for invalid inputs */ + R4[47] = r1; + for (i = 22; i >= 0; --i) { + r2 = R5[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 625); + R4[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 625); /* needed only for invalid inputs */ + R4[2 * i + 1] = r1; + } + + r2 = R4[47]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6400); + R3[94] = r0; + r1 = uint32_mod_uint14(r1, 1531); /* needed only for invalid inputs */ + R3[95] = r1; + for (i = 46; i >= 0; --i) { + r2 = R4[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6400); + R3[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 6400); /* needed only for invalid inputs */ + R3[2 * i + 1] = r1; + } + + R2[190] = R3[95]; + for (i = 94; i >= 0; --i) { + r2 = R3[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1280); + R2[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1280); /* needed only for invalid inputs */ + R2[2 * i + 1] = r1; + } + + R1[380] = R2[190]; + for (i = 189; i >= 0; --i) { + r2 = R2[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9157); + R1[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 9157); /* needed only for invalid inputs */ + R1[2 * i + 1] = r1; + } + + R0[760] = 3 * R1[380] - 2295; + for (i = 379; i >= 0; --i) { + r2 = R1[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1531); + R0[2 * i] = 3 * r0 - 2295; + r1 = uint32_mod_uint14(r1, 1531); /* needed only for invalid inputs */ + R0[2 * i + 1] = 3 * r1 - 2295; + } +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_761x1531.h b/crypto_kem/ntrulpr761/clean/crypto_decode_761x1531.h new file mode 100644 index 00000000..cc91874c --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_761X1531_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_761X1531_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x1531_STRBYTES 1007 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x1531_ITEMS 761 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x1531_ITEMBYTES 2 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x1531(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_761x3.c b/crypto_kem/ntrulpr761/clean/crypto_decode_761x3.c new file mode 100644 index 00000000..ec73b604 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_761x3.c @@ -0,0 +1,24 @@ +#include "crypto_decode_761x3.h" + +#define uint8 uint8_t + +#define p 761 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x3(void *v, const unsigned char *s) { + uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + } + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_761x3.h b/crypto_kem/ntrulpr761/clean/crypto_decode_761x3.h new file mode 100644 index 00000000..e7c095d4 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_761X3_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_761X3_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x3_STRBYTES 191 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x3_ITEMS 
761 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x3_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x3(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_761xint16.c b/crypto_kem/ntrulpr761/clean/crypto_decode_761xint16.c new file mode 100644 index 00000000..d13db387 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_761xint16.c @@ -0,0 +1,16 @@ +#include "crypto_decode_761xint16.h" + + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_761xint16.h b/crypto_kem/ntrulpr761/clean/crypto_decode_761xint16.h new file mode 100644 index 00000000..eea533c8 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_761XINT16_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_761XINT16_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint16_STRBYTES 1522 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint16_ITEMS 761 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_761xint32.c b/crypto_kem/ntrulpr761/clean/crypto_decode_761xint32.c new file mode 100644 index 00000000..aaa8f49d --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_761xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_761xint32.h" + + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_decode_761xint32.h b/crypto_kem/ntrulpr761/clean/crypto_decode_761xint32.h new file mode 100644 index 00000000..253150d7 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_decode_761xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_761XINT32_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_DECODE_761XINT32_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint32_STRBYTES 3044 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint32_ITEMBYTES 4 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint32_ITEMS 761 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint32(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_256x16.c b/crypto_kem/ntrulpr761/clean/crypto_encode_256x16.c new file mode 100644 index 00000000..c3f825b9 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_256x16.c @@ -0,0 +1,10 @@ +#include "crypto_encode_256x16.h" + + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x16(unsigned char *s, const void *v) { + const unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + s[i] = T[2 * i] + (T[2 * i + 1] << 4); + } +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_256x16.h b/crypto_kem/ntrulpr761/clean/crypto_encode_256x16.h new file mode 100644 index 00000000..93520514 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_256X16_H 
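(A minimal round-trip sketch of the 256x16 codec declared in this header and defined in crypto_encode_256x16.c / crypto_decode_256x16.c above: 256 four-bit values are packed two per byte, low nibble first, into 128 bytes. The test values and the helper name roundtrip_256x16 are illustrative only and are not part of the exported SUPERCOP code.)

#include <string.h>
#include "crypto_decode_256x16.h"
#include "crypto_encode_256x16.h"

/* illustrative round-trip check, not part of the patch */
static int roundtrip_256x16(void) {
    unsigned char T[256], T2[256], s[128];
    int i;
    for (i = 0; i < 256; ++i) {
        T[i] = (unsigned char) (i & 15);                  /* arbitrary 4-bit values */
    }
    PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x16(s, T);  /* s[i] = T[2i] | (T[2i+1] << 4) */
    PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x16(T2, s); /* inverse mapping */
    return memcmp(T, T2, sizeof T) == 0;                  /* 1 on success */
}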
+#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_256X16_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_256x2.c b/crypto_kem/ntrulpr761/clean/crypto_encode_256x2.c new file mode 100644 index 00000000..ff31479e --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_256x2.c @@ -0,0 +1,13 @@ +#include "crypto_encode_256x2.h" + + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x2(unsigned char *s, const void *v) { + const unsigned char *r = v; + int i; + for (i = 0; i < 32; ++i) { + s[i] = 0; + } + for (i = 0; i < 256; ++i) { + s[i >> 3] |= (r[i] & 1) << (i & 7); + } +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_256x2.h b/crypto_kem/ntrulpr761/clean/crypto_encode_256x2.h new file mode 100644 index 00000000..e0865155 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_256X2_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_256X2_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x2(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531.c b/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531.c new file mode 100644 index 00000000..74243957 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531.c @@ -0,0 +1,119 @@ +#include "crypto_encode_761x1531.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + uint16 r0, r1; + uint32 r2; + + for (i = 0; i < 380; ++i) { + r0 = (((R0[2 * i] + 2295) & 16383) * 10923) >> 15; + r1 = (((R0[2 * i + 1] + 2295) & 16383) * 10923) >> 15; + r2 = r0 + r1 * (uint32)1531; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[380] = (((R0[760] + 2295) & 16383) * 10923) >> 15; + + for (i = 0; i < 190; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9157; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[190] = R[380]; + + for (i = 0; i < 95; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1280; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[95] = R[190]; + + for (i = 0; i < 48; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)6400; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 24; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)625; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 12; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1526; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 6; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9097; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1263; + *out++ = r2; + 
r2 >>= 8; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)6232; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)593; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531.h b/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531.h new file mode 100644 index 00000000..2cf745b9 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_761X1531_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_761X1531_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531_STRBYTES 1007 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531_ITEMS 761 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531_ITEMBYTES 2 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531round.c b/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531round.c new file mode 100644 index 00000000..101283b5 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531round.c @@ -0,0 +1,17 @@ +#include "crypto_encode_761x1531.h" +#include "crypto_encode_761x1531round.h" + +#define int16 int16_t + +#define p 761 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531round(unsigned char *out, const void *v) { + const int16 *a = v; + int16 x[p]; + int i; + + for (i = 0; i < p; ++i) { + x[i] = 3 * ((10923 * a[i] + 16384) >> 15); + } + PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531(out, x); +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531round.h b/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531round.h new file mode 100644 index 00000000..10d34a51 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_761x1531round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_761X1531ROUND_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_761X1531ROUND_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531round_STRBYTES 1007 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531round_ITEMS 761 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531round_ITEMBYTES 2 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_761x3.c b/crypto_kem/ntrulpr761/clean/crypto_encode_761x3.c new file mode 100644 index 00000000..53cf0e21 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_761x3.c @@ -0,0 +1,21 @@ +#include "crypto_encode_761x3.h" + +#define uint8 uint8_t + +#define p 761 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x3(unsigned char *s, const void *v) { + const uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *f++ + 1; + x += (*f++ + 1) << 2; + x += (*f++ + 1) << 4; + x += (*f++ + 1) << 6; + *s++ = x; + } + x = *f++ + 1; + *s++ = x; +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_761x3.h b/crypto_kem/ntrulpr761/clean/crypto_encode_761x3.h new file mode 100644 index 00000000..13efec7b --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_761X3_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_761X3_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x3_STRBYTES 191 
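(The expression 3 * ((10923 * a + 16384) >> 15) in crypto_encode_761x1531round.c above rounds a to the nearest multiple of 3: 10923/2^15 is close enough to 1/3 that the shifted product equals round(a/3) over the whole range |a| <= (q-1)/2 = 2295, assuming an arithmetic right shift on negative values, as the surrounding code also assumes. A small stand-alone check of that claim, illustrative only and not part of the patch, follows.)

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    int32_t a;
    for (a = -2295; a <= 2295; ++a) {
        int32_t x = 3 * ((10923 * a + 16384) >> 15);    /* rounding as used above */
        int32_t lo = a - ((a % 3) + 3) % 3;             /* multiple of 3 at or below a */
        int32_t nearest = (a - lo <= 1) ? lo : lo + 3;  /* distance 0 or 1: keep lo; distance 2: go up */
        if (x != nearest) {
            printf("mismatch at a=%d: got %d, expected %d\n", (int) a, (int) x, (int) nearest);
            return EXIT_FAILURE;
        }
    }
    return EXIT_SUCCESS;
}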
+#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x3_ITEMS 761 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x3_ITEMBYTES 1 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_761xint16.c b/crypto_kem/ntrulpr761/clean/crypto_encode_761xint16.c new file mode 100644 index 00000000..ec3d1541 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_761xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_761xint16.h" + + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_encode_761xint16.h b/crypto_kem/ntrulpr761/clean/crypto_encode_761xint16.h new file mode 100644 index 00000000..e28f4336 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_encode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_761XINT16_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_ENCODE_761XINT16_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761xint16_STRBYTES 1522 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761xint16_ITEMS 761 + +void PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_sort_int32.c b/crypto_kem/ntrulpr761/clean/crypto_sort_int32.c new file mode 100644 index 00000000..9799240a --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_sort_int32.c @@ -0,0 +1,86 @@ +#include "crypto_sort_int32.h" +#include +// Based on supercop-20190110/crypto_sort/int32/x86 + + +#define int32 int32_t + +#define int32_MINMAX(a,b) \ + do { \ + int32_t ab = (b) ^ (a); \ + int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \ + c ^= ab & (c ^ (b)); \ + c >>= 31; \ + c &= ab; \ + (a) ^= c; \ + (b) ^= c; \ + } while(0) + +/* assume 2 <= n <= 0x40000000 */ +void PQCLEAN_NTRULPR761_CLEAN_crypto_sort_int32(int32 *array, size_t n) { + size_t top, p, q, r, i, j; + int32 *x = array; + + top = 1; + while (top < n - top) { + top += top; + } + + for (p = top; p >= 1; p >>= 1) { + i = 0; + while (i + 2 * p <= n) { + for (j = i; j < i + p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + i += 2 * p; + } + for (j = i; j < n - p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + + i = 0; + j = 0; + for (q = top; q > p; q >>= 1) { + if (j != i) { + for (;;) { + if (j == n - q) { + goto done; + } + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + if (j == i + p) { + i += 2 * p; + break; + } + } + } + while (i + p <= n - q) { + for (j = i; j < i + p; ++j) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + } + i += 2 * p; + } + /* now i + p > n - q */ + j = i; + while (j < n - q) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + } + +done: + ; + } + } +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_sort_int32.h b/crypto_kem/ntrulpr761/clean/crypto_sort_int32.h new file mode 100644 index 00000000..a927aabd --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_SORT_INT32_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_SORT_INT32_H + +#include +#include + + +void 
PQCLEAN_NTRULPR761_CLEAN_crypto_sort_int32(int32_t *array, size_t n); + +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_sort_uint32.c b/crypto_kem/ntrulpr761/clean/crypto_sort_uint32.c new file mode 100644 index 00000000..19ced42a --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_NTRULPR761_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_NTRULPR761_CLEAN_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_sort_uint32.h b/crypto_kem/ntrulpr761/clean/crypto_sort_uint32.h new file mode 100644 index 00000000..9333543f --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_sort_uint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_SORT_UINT32_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_SORT_UINT32_H + +#include +#include + + +void PQCLEAN_NTRULPR761_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n); + +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_stream_aes256ctr.c b/crypto_kem/ntrulpr761/clean/crypto_stream_aes256ctr.c new file mode 100644 index 00000000..6bc07077 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_stream_aes256ctr.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" + + +int PQCLEAN_NTRULPR761_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); + return 0; +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_stream_aes256ctr.h b/crypto_kem/ntrulpr761/clean/crypto_stream_aes256ctr.h new file mode 100644 index 00000000..dd212944 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_stream_aes256ctr.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_STREAM_AES256CTR_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_STREAM_AES256CTR_H +#include "aes.h" +#include +#include + + + +int PQCLEAN_NTRULPR761_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]); + +#endif diff --git a/crypto_kem/ntrulpr761/clean/crypto_verify_1167.c b/crypto_kem/ntrulpr761/clean/crypto_verify_1167.c new file mode 100644 index 00000000..c66c9adf --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_verify_1167.c @@ -0,0 +1,13 @@ +#include "crypto_verify_1167.h" + + +int PQCLEAN_NTRULPR761_CLEAN_crypto_verify_1167(const unsigned char *x, const unsigned char *y) { + unsigned int differentbits = 0; + int i; + + for (i = 0; i < PQCLEAN_NTRULPR761_CLEAN_crypto_verify_1167_BYTES; ++i) { + differentbits |= x[i] ^ y[i]; + } + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/crypto_kem/ntrulpr761/clean/crypto_verify_1167.h b/crypto_kem/ntrulpr761/clean/crypto_verify_1167.h new file mode 100644 index 00000000..3feefc83 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/crypto_verify_1167.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_NTRULPR761_CLEAN_CRYPTO_VERIFY_1167_H +#define PQCLEAN_NTRULPR761_CLEAN_CRYPTO_VERIFY_1167_H + +#include +#define PQCLEAN_NTRULPR761_CLEAN_crypto_verify_1167_BYTES 1167 + +int 
PQCLEAN_NTRULPR761_CLEAN_crypto_verify_1167(const unsigned char *x, const unsigned char *y); +#endif diff --git a/crypto_kem/ntrulpr761/clean/kem.c b/crypto_kem/ntrulpr761/clean/kem.c new file mode 100644 index 00000000..e2ecdd28 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/kem.c @@ -0,0 +1,287 @@ +#include "api.h" +#include "crypto_sort_uint32.h" +#include "crypto_stream_aes256ctr.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* ----- masks */ + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +#define q12 ((q-1)/2) +typedef int16 Fq; + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +/* assumes twos complement; use, e.g., gcc -fwrapv */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +/* works for all uint32 x */ +static Fq Fq_bigfreeze(uint32 x) { + x -= q * ((x * (uint64)q31) >> 31); + x -= q * ((x * (uint64)q31) >> 31); + x -= q; + x += (-(x >> 31)) & (uint32)q; + return x; +} + +/* ----- Top and Right */ + +static int8 Top(Fq C) { + return (tau1 * (int32)(C + tau0) + 16384) >> 15; +} + +static Fq Right(int8 T) { + return Fq_freeze(tau3 * (int32)T - tau2); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* ----- sorting to generate short polynomial */ + +static void Short_fromlist(small *out, const uint32 *in) { + uint32 L[ppadsort]; + int i; + + for (i = 0; i < w; ++i) { + L[i] = in[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (in[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_NTRULPR761_CLEAN_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[p]; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(out, L); +} + +/* ----- Inputs, Generator */ + +typedef int8 Inputs[I]; /* passed by reference */ + +static const unsigned char aes_nonce[16] = {0}; + +/* G = Generator(pk) */ +static void Generator(Fq *G, const unsigned char *pk) { + uint32 L[p]; + int i; + + PQCLEAN_NTRULPR761_CLEAN_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, pk); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + G[i] = Fq_bigfreeze(L[i]) - q12; + } +} + +/* ----- NTRU LPRime */ + +#define Seeds_bytes 32 +#define Ciphertexts_bytes (Rounded_bytes+Top_bytes) +#define SecretKeys_bytes Small_bytes +#define PublicKeys_bytes 
(Seeds_bytes+Rounded_bytes) +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +static void Hide(unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + small b[p]; + int i; + + Inputs_encode(r_enc + 1, r); + { + unsigned char h[Hash_bytes]; + uint32 L[p]; + { + unsigned char s[1 + Inputs_bytes]; + Inputs_encode(s + 1, r); + s[0] = 5; + Hash(h, s, sizeof s); + } + PQCLEAN_NTRULPR761_CLEAN_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, h); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(b, L); + } + { + Fq bG[p]; + Generator(bG, pk); + Rq_mult_small(bG, b); + Round_and_encode(c, bG); + c += Rounded_bytes; + } + { + Fq bA[p]; + int8 T[I]; + Rounded_decode(bA, pk + Seeds_bytes); + Rq_mult_small(bA, b); + for (i = 0; i < I; ++i) { + T[i] = Top(Fq_freeze(bA[i] + r[i] * q12)); + } + Top_encode(c, T); + c += Top_bytes; + } + { + unsigned char x[1 + Inputs_bytes + Hash_bytes]; + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] = r_enc[1 + i]; + } + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Inputs_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c, x, sizeof x); + } +} + + +int PQCLEAN_NTRULPR761_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + Fq aG[p]; + int i; + randombytes(pk, Seeds_bytes); + Generator(aG, pk); + { + small a[p]; + Short_random(a); + Rq_mult_small(aG, a); + Small_encode(sk, a); + } + Round_and_encode(pk + Seeds_bytes, aG); + { + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Inputs_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Inputs_bytes); + } + return 0; +} + +int PQCLEAN_NTRULPR761_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + int i; + unsigned char cache[Hash_bytes]; + { + unsigned char y[1 + PublicKeys_bytes]; + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + Inputs r; + { + unsigned char s[Inputs_bytes]; + randombytes(s, sizeof s); + Inputs_decode(r, s); + } + { + unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(c, x, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + return 0; +} + +int PQCLEAN_NTRULPR761_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Inputs_bytes; + Inputs r; + int i; + { + Fq aB[p]; + Rounded_decode(aB, c); + { + small a[p]; + Small_decode(a, sk); + Rq_mult_small(aB, a); + } + { + int8 T[I]; + Top_decode(T, c + Rounded_bytes); + for (i = 0; i < I; ++i) { + r[i] = -int16_negative_mask(Fq_freeze(Right(T[i]) - aB[i] + 4 * w + 1)); + } + } + } + { + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + int mask; + unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(cnew, x, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] ^= mask & (x[1 + i] ^ rho[i]); + } + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] 
= 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/ntrulpr761/clean/params.h b/crypto_kem/ntrulpr761/clean/params.h new file mode 100644 index 00000000..23495582 --- /dev/null +++ b/crypto_kem/ntrulpr761/clean/params.h @@ -0,0 +1,63 @@ +#ifndef params_H +#define params_H +#include "crypto_core_multsntrup761.h" +#include "crypto_decode_256x16.h" +#include "crypto_decode_256x2.h" +#include "crypto_decode_761x1531.h" +#include "crypto_decode_761x3.h" +#include "crypto_decode_761xint16.h" +#include "crypto_decode_761xint32.h" +#include "crypto_encode_256x16.h" +#include "crypto_encode_256x2.h" +#include "crypto_encode_761x1531.h" +#include "crypto_encode_761x1531round.h" +#include "crypto_encode_761x3.h" +#include "crypto_encode_761xint16.h" +#include "crypto_verify_1167.h" + + +#define p 761 +#define q 4591 +#define w 250 +#define q27 29235 /* closest integer to 2^27/q */ +#define q18 57 /* closest integer to 2^18/q */ +#define tau0 2156 +#define tau1 114 +#define tau2 2007 +#define tau3 287 +#define I 256 + +#define ppadsort 768 + +#define q18 57 /* round(2^18/q) */ +#define q27 29235 /* round(2^27/q) */ +#define q31 467759 /* floor(2^31/q) */ + +#define crypto_verify_clen PQCLEAN_NTRULPR761_CLEAN_crypto_verify_1167 + +#define Rounded_bytes PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x1531_STRBYTES +#define Rounded_decode PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x1531 + +#define Round_and_encode PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x1531round + +#define Small_bytes PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x3_STRBYTES +#define Small_encode PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761x3 +#define Small_decode PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761x3 + +#define Top_bytes PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x16_STRBYTES +#define Top_encode PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x16 +#define Top_decode PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x16 + +#define Inputs_bytes PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x2_STRBYTES +#define Inputs_encode PQCLEAN_NTRULPR761_CLEAN_crypto_encode_256x2 +#define Inputs_decode PQCLEAN_NTRULPR761_CLEAN_crypto_decode_256x2 + +#define crypto_decode_pxint32 PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint32 + +#define crypto_decode_pxint16 PQCLEAN_NTRULPR761_CLEAN_crypto_decode_761xint16 + +#define crypto_encode_pxint16 PQCLEAN_NTRULPR761_CLEAN_crypto_encode_761xint16 + +#define crypto_core_mult PQCLEAN_NTRULPR761_CLEAN_crypto_core_multsntrup761 + +#endif diff --git a/crypto_kem/ntrulpr857/META.yml b/crypto_kem/ntrulpr857/META.yml new file mode 100644 index 00000000..0726373b --- /dev/null +++ b/crypto_kem/ntrulpr857/META.yml @@ -0,0 +1,26 @@ +name: ntrulpr857 +type: kem +claimed-nist-level: 4 +claimed-security: IND-CCA2 +length-public-key: 1184 +length-secret-key: 1463 +length-ciphertext: 1312 +length-shared-secret: 32 +nistkat-sha256: cc8c8b8f3e31c07cce27c0e54c636884426593bf0f71c6e9215bde2ed3f516ef +principal-submitters: + - Daniel J. 
Bernstein + - Chitchanok Chuengsatiansup + - Tanja Lange + - Christine van Vredendaal +implementations: + - name: clean + version: supercop-20200826 + - name: avx2 + version: supercop-20200826 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 diff --git a/crypto_kem/ntrulpr857/avx2/LICENSE b/crypto_kem/ntrulpr857/avx2/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/ntrulpr857/avx2/Makefile b/crypto_kem/ntrulpr857/avx2/Makefile new file mode 100644 index 00000000..e4918b62 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/Makefile @@ -0,0 +1,22 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libntrulpr857_avx2.a +HEADERS=api.h crypto_core_multsntrup857.h crypto_core_multsntrup857_ntt.h crypto_decode_256x16.h crypto_decode_256x2.h crypto_decode_857x1723.h crypto_decode_857x3.h crypto_decode_857xint16.h crypto_decode_857xint32.h crypto_encode_256x16.h crypto_encode_256x2.h crypto_encode_857x1723.h crypto_encode_857x1723round.h crypto_encode_857x3.h crypto_encode_857xint16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_1312.h params.h +OBJECTS=crypto_core_multsntrup857.o crypto_core_multsntrup857_ntt.o crypto_decode_256x16.o crypto_decode_256x2.o crypto_decode_857x1723.o crypto_decode_857x3.o crypto_decode_857xint16.o crypto_decode_857xint32.o crypto_encode_256x16.o crypto_encode_256x2.o crypto_encode_857x1723.o crypto_encode_857x1723round.o crypto_encode_857x3.o crypto_encode_857xint16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1312.o kem.o + +CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.s $(HEADERS) + $(AS) -o $@ $< + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/ntrulpr857/avx2/api.h b/crypto_kem/ntrulpr857/avx2/api.h new file mode 100644 index 00000000..373863f4 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/api.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_API_H +#define PQCLEAN_NTRULPR857_AVX2_API_H + + + +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_ALGNAME "ntrulpr857" + +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_SECRETKEYBYTES 1463 +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_PUBLICKEYBYTES 1184 +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_CIPHERTEXTBYTES 1312 +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_BYTES 32 + +int PQCLEAN_NTRULPR857_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int PQCLEAN_NTRULPR857_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); +int PQCLEAN_NTRULPR857_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857.c b/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857.c new file mode 100644 index 00000000..674fa8e3 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857.c @@ -0,0 +1,421 @@ +#include "crypto_core_multsntrup857.h" +#include "crypto_core_multsntrup857_ntt.h" +#include "crypto_decode_857xint16.h" +#include "crypto_encode_857xint16.h" +#include + +typedef int8_t int8; +typedef int16_t int16; + +#define int16x16 __m256i +#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p)) +#define 
store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) +#define const_x16 _mm256_set1_epi16 +#define add_x16 _mm256_add_epi16 +#define sub_x16 _mm256_sub_epi16 +#define mullo_x16 _mm256_mullo_epi16 +#define mulhi_x16 _mm256_mulhi_epi16 +#define mulhrs_x16 _mm256_mulhrs_epi16 +#define signmask_x16(x) _mm256_srai_epi16((x),15) + +typedef union { + int16 v[512]; + int16x16 _dummy; +} vec512; + +typedef union { + int16 v[4][512]; + int16x16 _dummy; +} vec4x512; + +typedef union { + int16 v[1024]; + int16x16 _dummy; +} vec1024; + +typedef union { + int16 v[4 * 512]; + int16x16 _dummy; +} vec2048; + +static inline int16x16 squeeze_5167_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(6)), const_x16(5167))); +} + +static inline int16x16 squeeze_7681_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); +} + +static inline int16x16 squeeze_10753_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(3)), const_x16(10753))); +} + +static inline int16x16 mulmod_5167_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-19761)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(5167)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(7681)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_10753_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-10751)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(10753)); + return sub_x16(b, e); +} + +static void stride(int16 fpad[4][512], const int16 f[1024]) { + int16x16 f0, f1, f2, f3, g0, g1, g2, g3; + int i, j; + + for (j = 0; j < 256; j += 16) { + f0 = load_x16(&f[0]); + f1 = load_x16(&f[16]); + f2 = load_x16(&f[32]); + f3 = load_x16(&f[48]); + f += 64; + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g1 = _mm256_permute2x128_si256(f0, f2, 0x31); + g2 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g1); + f1 = _mm256_unpackhi_epi16(g0, g1); + f2 = _mm256_unpacklo_epi16(g2, g3); + f3 = _mm256_unpackhi_epi16(g2, g3); + g0 = _mm256_unpacklo_epi16(f0, f1); + g1 = _mm256_unpackhi_epi16(f0, f1); + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + + store_x16(&fpad[0][j], f0); + store_x16(&fpad[1][j], f1); + store_x16(&fpad[2][j], f2); + store_x16(&fpad[3][j], f3); + } + + for (i = 0; i < 4; ++i) { + for (j = 256; j < 512; ++j) { + fpad[i][j] = 0; + } + } +} + +static void unstride(int16 f[2048], const int16 fpad[4][512]) { + int16x16 f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int j; + + for (j = 0; j < 512; j += 16) { + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + f3 = load_x16(&fpad[3][j]); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, 
g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + + store_x16(&f[0], f0); + store_x16(&f[16], f1); + store_x16(&f[32], f2); + store_x16(&f[48], f3); + f += 64; + } +} + +static const vec512 y_7681 = { .v = { + -3593, -617, -2804, 3266, -2194, -1296, -1321, 810, 1414, 3706, -549, -396, -121, -2088, -2555, 1305, + -3777, 1921, 103, 3600, -2456, 1483, 1399, -1887, -1701, 2006, 1535, -3174, -2250, 2816, -2440, -1760, + -3625, 2830, 2043, -3689, 1100, 1525, -514, 7, 2876, -1599, 3153, -1881, -2495, -2237, -2535, 438, + 3182, 3364, -1431, 1738, 3696, -2557, -2956, 638, -2319, -1993, -2310, -3555, 834, -1986, 3772, -679, + 3593, 617, 2804, -3266, 2194, 1296, 1321, -810, -1414, -3706, 549, 396, 121, 2088, 2555, -1305, + 3777, -1921, -103, -3600, 2456, -1483, -1399, 1887, 1701, -2006, -1535, 3174, 2250, -2816, 2440, 1760, + 3625, -2830, -2043, 3689, -1100, -1525, 514, -7, -2876, 1599, -3153, 1881, 2495, 2237, 2535, -438, + -3182, -3364, 1431, -1738, -3696, 2557, 2956, -638, 2319, 1993, 2310, 3555, -834, 1986, -3772, 679, + 2665, 727, -2572, 2426, -2133, -1386, 1681, -1054, 2579, 3750, 373, 3417, 404, -2233, 3135, -3405, + -1799, 1521, 1497, -3831, -3480, -3428, 2883, -1698, -859, -2762, 2175, -194, -486, -3816, -1756, 2385, + -783, 1533, 3145, 2, 3310, -2743, 2224, -1166, 2649, -1390, 3692, 2789, 1919, 2835, -2391, -2732, + 1056, 1464, 1350, -915, -1168, -921, -3588, 3456, -2160, -1598, 730, 2919, 1532, -2764, -660, -2113, + -2665, -727, 2572, -2426, 2133, 1386, -1681, 1054, -2579, -3750, -373, -3417, -404, 2233, -3135, 3405, + 1799, -1521, -1497, 3831, 3480, 3428, -2883, 1698, 859, 2762, -2175, 194, 486, 3816, 1756, -2385, + 783, -1533, -3145, -2, -3310, 2743, -2224, 1166, -2649, 1390, -3692, -2789, -1919, -2835, 2391, 2732, + -1056, -1464, -1350, 915, 1168, 921, 3588, -3456, 2160, 1598, -730, -2919, -1532, 2764, 660, 2113, + 2005, -188, 2345, -3723, -1403, 2070, 83, -3214, -3752, -1012, 1837, -3208, 3287, 3335, -293, 796, + 592, 1519, -1338, 1931, 509, -2262, -3408, 3334, 3677, 2130, 642, 589, -2167, -1084, -370, -3163, + 3763, -893, -2303, -402, 2937, -1689, -1526, -3745, -2460, 2874, 2965, 124, -1669, -1441, -3312, 3781, + 2812, -2386, -2515, -429, -3343, 777, -826, -3366, -3657, -1404, -791, -2963, -692, 2532, 2083, 2258, + -2005, 188, -2345, 3723, 1403, -2070, -83, 3214, 3752, 1012, -1837, 3208, -3287, -3335, 293, -796, + -592, -1519, 1338, -1931, -509, 2262, 3408, -3334, -3677, -2130, -642, -589, 2167, 1084, 370, 3163, + -3763, 893, 2303, 402, -2937, 1689, 1526, 3745, 2460, -2874, -2965, -124, 1669, 1441, 3312, -3781, + -2812, 2386, 2515, 429, 3343, -777, 826, 3366, 3657, 1404, 791, 2963, 692, -2532, -2083, -2258, + 179, 1121, 2891, -3581, 3177, -658, -3314, -1509, -17, 151, 2815, 2786, 1278, -2767, -1072, -1151, + -1242, -2071, 2340, -1586, 2072, 1476, 2998, 2918, -3744, -3794, -1295, 451, -929, 2378, -1144, 434, + -1070, -436, -3550, -3568, 1649, 715, 3461, -1407, -2001, -1203, 3770, 1712, 2230, -3542, 2589, -3547, + -2059, -236, 3434, -3693, 2161, -670, 2719, 2339, -2422, 1181, 3450, 222, 1348, -226, 2247, -1779, + -179, -1121, -2891, 3581, -3177, 658, 3314, 1509, 17, -151, -2815, -2786, -1278, 2767, 1072, 1151, + 1242, 2071, -2340, 1586, -2072, -1476, -2998, -2918, 3744, 3794, 1295, -451, 929, -2378, 1144, -434, + 1070, 436, 3550, 3568, -1649, -715, -3461, 1407, 2001, 1203, -3770, -1712, -2230, 3542, 
-2589, 3547, + 2059, 236, -3434, 3693, -2161, 670, -2719, -2339, 2422, -1181, -3450, -222, -1348, 226, -2247, 1779, + } +} ; +static const vec512 y_10753 = { .v = { + 1018, -1520, -2935, -4189, 2413, 918, 4, 1299, -2695, 1341, -205, -4744, -3784, 2629, 2565, -3062, + 223, -4875, 2790, -2576, -3686, -2503, 3550, -3085, 730, 1931, -4513, 4876, -3364, 5213, 2178, 2984, + 4188, -4035, 4129, -544, 357, 4347, 1284, -2388, -4855, 341, -1287, 4102, 425, 5175, -4616, -4379, + -3688, 5063, 3091, 1085, -376, 3012, -268, -1009, -2236, -3823, 2982, -4742, -4544, -4095, 193, 847, + -1018, 1520, 2935, 4189, -2413, -918, -4, -1299, 2695, -1341, 205, 4744, 3784, -2629, -2565, 3062, + -223, 4875, -2790, 2576, 3686, 2503, -3550, 3085, -730, -1931, 4513, -4876, 3364, -5213, -2178, -2984, + -4188, 4035, -4129, 544, -357, -4347, -1284, 2388, 4855, -341, 1287, -4102, -425, -5175, 4616, 4379, + 3688, -5063, -3091, -1085, 376, -3012, 268, 1009, 2236, 3823, -2982, 4742, 4544, 4095, -193, -847, + -4734, 4977, -400, -864, 567, -5114, -4286, 635, 512, -1356, -779, -2973, 675, -5064, -1006, 1268, + 2998, 2981, -151, -3337, 3198, -909, 2737, -970, 2774, 886, 2206, 1324, 2271, 454, -326, -3715, + -3441, -4580, 636, 2234, -794, 3615, 578, -472, 3057, -5156, -2740, 2684, 1615, -1841, -336, -1586, + 5341, -116, 5294, 4123, 5023, -1458, -3169, 467, -2045, 4828, -1572, -5116, -2213, -4808, 2884, 1068, + 4734, -4977, 400, 864, -567, 5114, 4286, -635, -512, 1356, 779, 2973, -675, 5064, 1006, -1268, + -2998, -2981, 151, 3337, -3198, 909, -2737, 970, -2774, -886, -2206, -1324, -2271, -454, 326, 3715, + 3441, 4580, -636, -2234, 794, -3615, -578, 472, -3057, 5156, 2740, -2684, -1615, 1841, 336, 1586, + -5341, 116, -5294, -4123, -5023, 1458, 3169, -467, 2045, -4828, 1572, 5116, 2213, 4808, -2884, -1068, + 3453, 2196, 2118, 5005, 2428, -2062, -1930, 2283, 4601, 3524, -3241, -1409, -2230, -5015, 4359, 4254, + 5309, 2657, -2050, -4428, 4250, -2015, -3148, -778, 2624, -1573, 40, 2237, -573, -4447, 2909, 1122, + 854, -4782, 2439, 4408, 5172, 4784, 4144, 1639, 3760, 2139, 2680, -663, 4621, 3135, 1349, -97, + 5215, 3410, -2117, -1992, -1381, -1635, 274, -2419, 3570, 458, 2087, -2374, -1132, 2662, -1722, 5313, + -3453, -2196, -2118, -5005, -2428, 2062, 1930, -2283, -4601, -3524, 3241, 1409, 2230, 5015, -4359, -4254, + -5309, -2657, 2050, 4428, -4250, 2015, 3148, 778, -2624, 1573, -40, -2237, 573, 4447, -2909, -1122, + -854, 4782, -2439, -4408, -5172, -4784, -4144, -1639, -3760, -2139, -2680, 663, -4621, -3135, -1349, 97, + -5215, -3410, 2117, 1992, 1381, 1635, -274, 2419, -3570, -458, -2087, 2374, 1132, -2662, 1722, -5313, + -2487, -554, 4519, 2449, 73, 3419, 624, -1663, -1053, 4889, 279, 1893, 1111, 1510, 2279, -4540, + 2529, 2963, 5120, -3995, -5107, -3360, -5356, 2625, -4403, 152, -5083, -2807, 2113, -4000, -4328, 3125, + -2605, 4967, -1056, 1160, 1927, 693, -4003, 3827, -4670, -569, 3535, -5268, 1782, 825, 355, 5068, + 5334, 4859, -1689, -2788, -4891, -3260, 1204, 3891, -4720, -4973, 2813, 2205, 834, -4393, -2151, 3096, + 2487, 554, -4519, -2449, -73, -3419, -624, 1663, 1053, -4889, -279, -1893, -1111, -1510, -2279, 4540, + -2529, -2963, -5120, 3995, 5107, 3360, 5356, -2625, 4403, -152, 5083, 2807, -2113, 4000, 4328, -3125, + 2605, -4967, 1056, -1160, -1927, -693, 4003, -3827, 4670, 569, -3535, 5268, -1782, -825, -355, -5068, + -5334, -4859, 1689, 2788, 4891, 3260, -1204, -3891, 4720, 4973, -2813, -2205, -834, 4393, 2151, -3096, + } +} ; +/* + can also compute these on the fly, and share storage, + at expense of 2 NTTs on top of the 24 
NTTs below: + ... + for (i = 0;i < 512;++i) y_7681[i] = 0; + y_7681[1] = -3593; + PQCLEAN_NTRULPR857_AVX2_ntt512_7681(y_7681,1); + ... + for (i = 0;i < 512;++i) y_10753[i] = 0; + y_10753[1] = 1018; + PQCLEAN_NTRULPR857_AVX2_ntt512_10753(y_10753,1); +*/ + +static void mult1024(int16 h[2048], const int16 f[1024], const int16 g[1024]) { + vec4x512 x1, x2; + vec2048 x3, x4; +#define fpad (x1.v) +#define gpad (x2.v) +#define hpad fpad +#define h_7681 (x3.v) +#define h_10753 (x4.v) + int i; + + stride(fpad, f); + PQCLEAN_NTRULPR857_AVX2_ntt512_7681(fpad[0], 4); + + stride(gpad, g); + PQCLEAN_NTRULPR857_AVX2_ntt512_7681(gpad[0], 4); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); + int16x16 f3 = squeeze_7681_x16(load_x16(&fpad[3][i])); + int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); + int16x16 g3 = squeeze_7681_x16(load_x16(&gpad[3][i])); + int16x16 d0 = mulmod_7681_x16(f0, g0); + int16x16 d1 = mulmod_7681_x16(f1, g1); + int16x16 d2 = mulmod_7681_x16(f2, g2); + int16x16 d3 = mulmod_7681_x16(f3, g3); + int16x16 d0d1 = add_x16(d0, d1); + int16x16 d0d1d2 = add_x16(d0d1, d2); + int16x16 d0d1d2d3 = squeeze_7681_x16(add_x16(d0d1d2, d3)); + int16x16 d2d3 = add_x16(d2, d3); + int16x16 d1d2d3 = add_x16(d1, d2d3); + int16x16 e01 = mulmod_7681_x16(sub_x16(f0, f1), sub_x16(g0, g1)); + int16x16 e02 = mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g0, g2)); + int16x16 e03 = mulmod_7681_x16(sub_x16(f0, f3), sub_x16(g0, g3)); + int16x16 e12 = mulmod_7681_x16(sub_x16(f1, f2), sub_x16(g1, g2)); + int16x16 e13 = mulmod_7681_x16(sub_x16(f1, f3), sub_x16(g1, g3)); + int16x16 e23 = mulmod_7681_x16(sub_x16(f2, f3), sub_x16(g2, g3)); + int16x16 h0 = d0; + int16x16 h1 = sub_x16(d0d1, e01); + int16x16 h2 = sub_x16(d0d1d2, e02); + int16x16 h3 = sub_x16(d0d1d2d3, add_x16(e12, e03)); + int16x16 h4 = sub_x16(d1d2d3, e13); + int16x16 h5 = sub_x16(d2d3, e23); + int16x16 h6 = d3; + int16x16 twist = load_x16(&y_7681.v[i]); + h4 = mulmod_7681_x16(h4, twist); + h5 = mulmod_7681_x16(h5, twist); + h6 = mulmod_7681_x16(h6, twist); + h0 = add_x16(h0, h4); + h1 = add_x16(h1, h5); + h2 = add_x16(h2, h6); + store_x16(&hpad[0][i], squeeze_7681_x16(h0)); + store_x16(&hpad[1][i], squeeze_7681_x16(h1)); + store_x16(&hpad[2][i], squeeze_7681_x16(h2)); + store_x16(&hpad[3][i], squeeze_7681_x16(h3)); + } + + PQCLEAN_NTRULPR857_AVX2_invntt512_7681(hpad[0], 4); + unstride(h_7681, (const int16(*)[512]) hpad); + + stride(fpad, f); + PQCLEAN_NTRULPR857_AVX2_ntt512_10753(fpad[0], 4); + + stride(gpad, g); + PQCLEAN_NTRULPR857_AVX2_ntt512_10753(gpad[0], 4); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_10753_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_10753_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_10753_x16(load_x16(&fpad[2][i])); + int16x16 f3 = squeeze_10753_x16(load_x16(&fpad[3][i])); + int16x16 g0 = squeeze_10753_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_10753_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_10753_x16(load_x16(&gpad[2][i])); + int16x16 g3 = squeeze_10753_x16(load_x16(&gpad[3][i])); + int16x16 d0 = mulmod_10753_x16(f0, g0); + int16x16 d1 = mulmod_10753_x16(f1, g1); + int16x16 d2 = mulmod_10753_x16(f2, g2); + int16x16 d3 = mulmod_10753_x16(f3, g3); + int16x16 d0d1 = add_x16(d0, d1); + int16x16 d0d1d2 = add_x16(d0d1, 
d2); + int16x16 d0d1d2d3 = squeeze_10753_x16(add_x16(d0d1d2, d3)); + int16x16 d2d3 = add_x16(d2, d3); + int16x16 d1d2d3 = add_x16(d1, d2d3); + int16x16 e01 = mulmod_10753_x16(sub_x16(f0, f1), sub_x16(g0, g1)); + int16x16 e02 = mulmod_10753_x16(sub_x16(f0, f2), sub_x16(g0, g2)); + int16x16 e03 = mulmod_10753_x16(sub_x16(f0, f3), sub_x16(g0, g3)); + int16x16 e12 = mulmod_10753_x16(sub_x16(f1, f2), sub_x16(g1, g2)); + int16x16 e13 = mulmod_10753_x16(sub_x16(f1, f3), sub_x16(g1, g3)); + int16x16 e23 = mulmod_10753_x16(sub_x16(f2, f3), sub_x16(g2, g3)); + int16x16 h0 = d0; + int16x16 h1 = sub_x16(d0d1, e01); + int16x16 h2 = sub_x16(d0d1d2, e02); + int16x16 h3 = sub_x16(d0d1d2d3, add_x16(e12, e03)); + int16x16 h4 = sub_x16(d1d2d3, e13); + int16x16 h5 = sub_x16(d2d3, e23); + int16x16 h6 = d3; + int16x16 twist = load_x16(&y_10753.v[i]); + h4 = mulmod_10753_x16(h4, twist); + h5 = mulmod_10753_x16(h5, twist); + h6 = mulmod_10753_x16(h6, twist); + h0 = add_x16(h0, h4); + h1 = add_x16(h1, h5); + h2 = add_x16(h2, h6); + store_x16(&hpad[0][i], squeeze_10753_x16(h0)); + store_x16(&hpad[1][i], squeeze_10753_x16(h1)); + store_x16(&hpad[2][i], squeeze_10753_x16(h2)); + store_x16(&hpad[3][i], squeeze_10753_x16(h3)); + } + + PQCLEAN_NTRULPR857_AVX2_invntt512_10753(hpad[0], 4); + unstride(h_10753, (const int16(*)[512]) hpad); + + for (i = 0; i < 2048; i += 16) { + int16x16 u1 = load_x16(&h_10753[i]); + int16x16 u2 = load_x16(&h_7681[i]); + int16x16 t; + u1 = mulmod_10753_x16(u1, const_x16(1268)); + u2 = mulmod_7681_x16(u2, const_x16(956)); + t = mulmod_7681_x16(sub_x16(u2, u1), const_x16(-2539)); + t = add_x16(u1, mulmod_5167_x16(t, const_x16(2146))); + store_x16(&h[i], t); + } +} + +#define crypto_decode_pxint16 PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint16 +#define crypto_encode_pxint16 PQCLEAN_NTRULPR857_AVX2_crypto_encode_857xint16 + +#define p 857 +#define q 5167 + +static inline int16x16 freeze_5167_x16(int16x16 x) { + int16x16 mask, xq; + x = add_x16(x, const_x16(q)&signmask_x16(x)); + mask = signmask_x16(sub_x16(x, const_x16((q + 1) / 2))); + xq = sub_x16(x, const_x16(q)); + x = _mm256_blendv_epi8(xq, x, mask); + return x; +} + +int PQCLEAN_NTRULPR857_AVX2_crypto_core_multsntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + vec1024 x1, x2; + vec2048 x3; +#define f (x1.v) +#define g (x2.v) +#define fg (x3.v) +#define h f + int i; + int16x16 x; + + x = const_x16(0); + for (i = p & ~15; i < 1024; i += 16) { + store_x16(&f[i], x); + } + for (i = p & ~15; i < 1024; i += 16) { + store_x16(&g[i], x); + } + + crypto_decode_pxint16(f, inbytes); + + for (i = 0; i < 1024; i += 16) { + x = load_x16(&f[i]); + x = freeze_5167_x16(squeeze_5167_x16(x)); + store_x16(&f[i], x); + } + for (i = 0; i < p; ++i) { + int8 gi = kbytes[i]; + int8 gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + mult1024(fg, f, g); + + fg[0] -= fg[p - 1]; + for (i = 0; i < 1024; i += 16) { + int16x16 fgi = load_x16(&fg[i]); + int16x16 fgip = load_x16(&fg[i + p]); + int16x16 fgip1 = load_x16(&fg[i + p - 1]); + x = add_x16(fgi, add_x16(fgip, fgip1)); + x = freeze_5167_x16(squeeze_5167_x16(x)); + store_x16(&h[i], x); + } + + crypto_encode_pxint16(outbytes, h); + + return 0; +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857.h b/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857.h new file mode 100644 index 00000000..f6ab0e08 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_CORE_MULTSNTRUP857_H 
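A note on the final loop of mult1024 above (crypto_core_multsntrup857.c): the 2048-coefficient product is computed twice, once modulo 7681 and once modulo 10753, because a single 16-bit prime cannot hold the true product coefficients. With f frozen mod q = 5167 (so |f[i]| <= 2583) and g[i] in {-1,0,1}, each product coefficient is bounded by 857 * 2583 = 2,213,631, well below (7681 * 10753)/2 ~ 4.1e7, so the pair of residues determines it exactly. The loop with the fixed constants 1268, 956, -2539 and 2146 performs that CRT recombination with the inverse-NTT scaling folded into the scaled-Montgomery mulmod routines. The scalar sketch below is not part of the patch; it shows the same recombination with a runtime modular inverse instead of precomputed constants, and returns a representative in [0, q).

#include <stdint.h>

/* extended Euclid: inverse of a modulo m, assuming gcd(a, m) = 1 */
static int64_t modinv(int64_t a, int64_t m) {
    int64_t t = 0, newt = 1, r = m, newr = ((a % m) + m) % m;
    while (newr != 0) {
        int64_t q = r / newr, tmp;
        tmp = t - q * newt; t = newt; newt = tmp;
        tmp = r - q * newr; r = newr; newr = tmp;
    }
    return ((t % m) + m) % m;
}

/* recombine residues of one product coefficient and reduce it mod q = 5167 */
static int16_t crt_to_5167(int16_t r7681, int16_t r10753) {
    const int64_t m1 = 7681, m2 = 10753, q = 5167, M = m1 * m2;
    int64_t d = (((int64_t) r7681 - r10753) % m1 + m1) % m1;
    int64_t k = (d * modinv(m2 % m1, m1)) % m1;   /* Garner coefficient */
    int64_t c = ((int64_t) r10753 + m2 * k) % M;
    if (c < 0) c += M;        /* canonical residue mod m1*m2 */
    if (c > M / 2) c -= M;    /* centered lift = exact product coefficient */
    c %= q;
    if (c < 0) c += q;        /* representative in [0, q) */
    return (int16_t) c;
}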
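Also in crypto_core_multsntrup857 above: after mult1024, the product (degree at most 1712, stored in a 2048-entry array) is folded back into Z/q[x]/(x^857 - x - 1). Since x^857 = x + 1 in that ring, fg[i+857] contributes to positions i and i+1, which is exactly the uniform h[i] = fg[i] + fg[i+p] + fg[i+p-1] loop; the fg[0] -= fg[p-1] adjustment cancels the extra fg[p-1] term that the uniform formula would otherwise add at i = 0. A scalar sketch of the same folding (not part of the patch; the vectorized code works over 1024-padded arrays and re-freezes mod q afterwards):

#include <stdint.h>

/* reduce a product of two polynomials of degree < p modulo x^p - x - 1,
   using x^p = x + 1; prod has 2p-1 coefficients, out receives p of them */
static void fold_xp_x_1(int32_t *out, const int32_t *prod, int p) {
    for (int i = 0; i < p; ++i) {
        out[i] = prod[i];
    }
    for (int m = p; m <= 2 * p - 2; ++m) {
        out[m - p] += prod[m];      /* x^m -> x^(m-p)   */
        out[m - p + 1] += prod[m];  /* x^m -> x^(m-p+1) */
    }
}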
+#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_CORE_MULTSNTRUP857_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_core_multsntrup857_OUTPUTBYTES 1714 +#define PQCLEAN_NTRULPR857_AVX2_crypto_core_multsntrup857_INPUTBYTES 1714 +#define PQCLEAN_NTRULPR857_AVX2_crypto_core_multsntrup857_KEYBYTES 857 +#define PQCLEAN_NTRULPR857_AVX2_crypto_core_multsntrup857_CONSTBYTES 0 + +int PQCLEAN_NTRULPR857_AVX2_crypto_core_multsntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857_ntt.c b/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857_ntt.c new file mode 100644 index 00000000..bcadf637 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857_ntt.c @@ -0,0 +1,927 @@ +#include "crypto_core_multsntrup857.h" +#include "crypto_core_multsntrup857_ntt.h" +#include +#include + +/* auto-generated; do not edit */ + + +typedef int8_t int8; +typedef int16_t int16; + +#define zeta(n,i) (((__m256i *) zeta_##n)[(i)]) +#define zeta_x4(n,i) (((__m256i *) zeta_x4_##n)[(i)]) +#define zeta_qinv(n,i) (((__m256i *) qinvzeta_##n)[(i)]) +#define zeta_x4_qinv(n,i) (((__m256i *) qinvzeta_x4_##n)[(i)]) +#define zetainv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_x4_##n+2*(n)+4-16*((i)+1))) +#define zetainv_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_x4_##n+2*(n)+4-16*((i)+1))) + +typedef union { + int16 data[93 * 16]; + __m256i _dummy; +} vec1488; + +static const vec1488 qdata_7681 = { .data = { + +#define q_x16 (qdata[0]) + 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, + +#define qrecip_x16 (qdata[1]) + 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, + +#define qshift_x16 (qdata[2]) + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + +#define zeta4_x16 (qdata[3]) + -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, + +#define zeta4_x16_qinv (qdata[4]) + -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, + +#define zeta8_x16 (qdata[5]) + -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, + +#define zeta8_x16_qinv (qdata[6]) + -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, + +#define zetainv8_x16 (qdata[7]) + -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, + +#define zetainv8_x16_qinv (qdata[8]) + -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, + +#define zeta_x4_16 (qdata+9) + -3593, -3593, -3593, -3593, -2194, -2194, -2194, -2194, -3625, -3625, -3625, -3625, 1100, 1100, 1100, 1100, + -3777, -3777, -3777, -3777, -2456, -2456, -2456, -2456, 3182, 3182, 3182, 3182, 3696, 3696, 3696, 3696, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_16 (qdata+12) + -9, -9, -9, -9, 4974, 4974, 4974, 4974, -16425, -16425, -16425, -16425, 7244, 7244, 7244, 7244, + -28865, -28865, -28865, -28865, 
-14744, -14744, -14744, -14744, 10350, 10350, 10350, 10350, -4496, -4496, -4496, -4496, + 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_x4_32 (qdata+15) + -3593, -3593, -3593, -3593, 1414, 1414, 1414, 1414, -2194, -2194, -2194, -2194, -2495, -2495, -2495, -2495, + -3625, -3625, -3625, -3625, 2876, 2876, 2876, 2876, 1100, 1100, 1100, 1100, -2250, -2250, -2250, -2250, + -3777, -3777, -3777, -3777, -1701, -1701, -1701, -1701, -2456, -2456, -2456, -2456, 834, 834, 834, 834, + 3182, 3182, 3182, 3182, -2319, -2319, -2319, -2319, 3696, 3696, 3696, 3696, 121, 121, 121, 121, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_32 (qdata+20) + -9, -9, -9, -9, 20870, 20870, 20870, 20870, 4974, 4974, 4974, 4974, 22593, 22593, 22593, 22593, + -16425, -16425, -16425, -16425, 828, 828, 828, 828, 7244, 7244, 7244, 7244, -23754, -23754, -23754, -23754, + -28865, -28865, -28865, -28865, 20315, 20315, 20315, 20315, -14744, -14744, -14744, -14744, 18242, 18242, 18242, 18242, + 10350, 10350, 10350, 10350, -18191, -18191, -18191, -18191, -4496, -4496, -4496, -4496, -11655, -11655, -11655, -11655, + 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_64 (qdata+25) + -3593, -617, 1414, 3706, -2194, -1296, -2495, -2237, -3625, 2830, 2876, -1599, 1100, 1525, -2250, 2816, + -3777, 1921, -1701, 2006, -2456, 1483, 834, -1986, 3182, 3364, -2319, -1993, 3696, -2557, 121, 2088, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_64 (qdata+28) + -9, 19351, 20870, -15750, 4974, -9488, 22593, 7491, -16425, 26382, 828, 23489, 7244, 20469, -23754, 2816, + -28865, -5759, 20315, -3114, -14744, 15307, 18242, -19394, 10350, -10972, -18191, -31177, -4496, -25597, -11655, 22568, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_128 (qdata+31) + -3593, -2804, -617, -396, 1414, -549, 3706, 810, -2194, -1321, -1296, 438, -2495, -2535, -2237, -3689, + -3625, 2043, 2830, -1881, 2876, 3153, -1599, 7, 1100, -514, 1525, -1760, -2250, -2440, 2816, 3600, + -3777, 103, 1921, -3174, -1701, 1535, 2006, -1887, -2456, 1399, 1483, -679, 834, 3772, -1986, 1738, + 3182, -1431, 3364, -3555, -2319, -2310, -1993, 638, 3696, -2956, -2557, -1305, 121, 2555, 2088, -3266, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_128 (qdata+36) + -9, -29428, 19351, 26228, 20870, 21467, -15750, 5930, 4974, -14121, -9488, -21066, 22593, 2073, 7491, 16279, + -16425, -25093, 26382, 26279, 828, -29103, 23489, 11783, 7244, 14846, 20469, 14624, -23754, -6536, 2816, 11792, + -28865, -4505, -5759, -6246, 20315, 9215, -3114, 6817, -14744, 4983, 15307, -28839, 18242, 1724, -19394, 23242, + 10350, -21399, -10972, -29667, -18191, -21766, -31177, 15998, -4496, 23668, -25597, -5913, -11655, -24581, 22568, -20674, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_256 (qdata+41) + -3593, 2665, -2804, -2572, -617, 727, -396, 3417, 1414, 2579, -549, 373, 3706, 3750, 810, -1054, + -2194, -2133, -1321, 1681, -1296, -1386, 438, -2732, -2495, 1919, -2535, -2391, -2237, 2835, -3689, 2, + -3625, -783, 2043, 3145, 2830, 1533, -1881, 2789, 2876, 2649, 3153, 3692, -1599, -1390, 7, -1166, + 1100, 3310, -514, 2224, 1525, -2743, -1760, 2385, -2250, -486, -2440, -1756, 2816, -3816, 3600, -3831, + -3777, -1799, 103, 1497, 1921, 1521, -3174, -194, -1701, -859, 1535, 2175, 2006, -2762, -1887, -1698, + -2456, -3480, 1399, 2883, 1483, -3428, -679, -2113, 834, 1532, 3772, -660, -1986, -2764, 1738, -915, + 3182, 1056, -1431, 1350, 3364, 1464, -3555, 2919, -2319, 
-2160, -2310, 730, -1993, -1598, 638, 3456, + 3696, -1168, -2956, -3588, -2557, -921, -1305, 3405, 121, -404, 2555, -3135, 2088, 2233, -3266, -2426, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_256 (qdata+50) + -9, -17303, -29428, 24052, 19351, -12073, 26228, -24743, 20870, -12269, 21467, 19317, -15750, -25946, 5930, 32738, + 4974, -4693, -14121, 2193, -9488, 26262, -21066, 7508, 22593, 9599, 2073, 10409, 7491, -12013, 16279, -15358, + -16425, -16655, -25093, 32329, 26382, 24573, 26279, 13541, 828, -25511, -29103, 26220, 23489, -8558, 11783, -24718, + 7244, 10478, 14846, 26800, 20469, 26441, 14624, -29871, -23754, -3558, -6536, -16092, 2816, 8472, 11792, -7415, + -28865, -13575, -4505, -26663, -5759, -14351, -6246, -17602, 20315, -22875, 9215, 9855, -3114, -24266, 6817, -2722, + -14744, -15768, 4983, 12611, 15307, -21860, -28839, -27201, 18242, 32252, 1724, 21868, -19394, -8908, 23242, 13933, + 10350, 17440, -21399, -11962, -10972, 30136, -29667, -1689, -18191, 6032, -21766, 30426, -31177, 15810, 15998, 3456, + -4496, -9360, 23668, 27132, -25597, -5529, -5913, 1869, -11655, 22124, -24581, 21953, 22568, 23225, -20674, 17030, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_512 (qdata+59) + -3593, 2005, 2665, 2891, -2804, 2345, -2572, 1121, -617, -188, 727, 2786, -396, -3208, 3417, -17, + 1414, -3752, 2579, 2815, -549, 1837, 373, 151, 3706, -1012, 3750, -1509, 810, -3214, -1054, 3177, + -2194, -1403, -2133, -3314, -1321, 83, 1681, -658, -1296, 2070, -1386, -3547, 438, 3781, -2732, 2230, + -2495, -1669, 1919, 2589, -2535, -3312, -2391, -3542, -2237, -1441, 2835, -3568, -3689, -402, 2, -1070, + -3625, 3763, -783, -3550, 2043, -2303, 3145, -436, 2830, -893, 1533, 1712, -1881, 124, 2789, -2001, + 2876, -2460, 2649, 3770, 3153, 2965, 3692, -1203, -1599, 2874, -1390, -1407, 7, -3745, -1166, 1649, + 1100, 2937, 3310, 3461, -514, -1526, 2224, 715, 1525, -1689, -2743, 434, -1760, -3163, 2385, -929, + -2250, -2167, -486, -1144, -2440, -370, -1756, 2378, 2816, -1084, -3816, -1586, 3600, 1931, -3831, -1242, + -3777, 592, -1799, 2340, 103, -1338, 1497, -2071, 1921, 1519, 1521, 451, -3174, 589, -194, -3744, + -1701, 3677, -859, -1295, 1535, 642, 2175, -3794, 2006, 2130, -2762, 2918, -1887, 3334, -1698, 2072, + -2456, 509, -3480, 2998, 1399, -3408, 2883, 1476, 1483, -2262, -3428, -1779, -679, 2258, -2113, 1348, + 834, -692, 1532, 2247, 3772, 2083, -660, -226, -1986, 2532, -2764, -3693, 1738, -429, -915, -2059, + 3182, 2812, 1056, 3434, -1431, -2515, 1350, -236, 3364, -2386, 1464, 222, -3555, -2963, 2919, -2422, + -2319, -3657, -2160, 3450, -2310, -791, 730, 1181, -1993, -1404, -1598, 2339, 638, -3366, 3456, 2161, + 3696, -3343, -1168, 2719, -2956, -826, -3588, -670, -2557, 777, -921, 1151, -1305, -796, 3405, -1278, + 121, -3287, -404, 1072, 2555, 293, -3135, 2767, 2088, -3335, 2233, 3581, -3266, 3723, -2426, -179, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_512 (qdata+76) + -9, 4565, -17303, 16715, -29428, 15145, 24052, -22943, 19351, 1860, -12073, -28958, 26228, -7304, -24743, -529, + 20870, -24232, -12269, 10495, 21467, -16083, 19317, 20119, -15750, -27636, -25946, -12261, 5930, -26766, 32738, -16791, + 4974, 25733, -4693, 20238, -14121, 18003, 2193, 6510, -9488, 29718, 26262, -25563, -21066, -1851, 7508, -19274, + 22593, -28805, 9599, -23523, 2073, 4880, 10409, 1578, 7491, -10145, -12013, 4624, 16279, 6766, -15358, 24530, + -16425, 5299, -16655, -2526, -25093, -9983, 32329, 5708, 26382, -23933, 24573, 26288, 26279, 30844, 13541, 
30255, + 828, 15972, -25511, 17082, -29103, -27243, 26220, -2739, 23489, 16186, -8558, -9087, 11783, -12449, -24718, -14223, + 7244, -8839, 10478, 30597, 14846, -12790, 26800, 14539, 20469, -6297, 26441, 9650, 14624, -25179, -29871, -9633, + -23754, -5751, -3558, 2952, -6536, 23182, -16092, 23882, 2816, 964, 8472, -10802, 11792, -17013, -7415, -30938, + -28865, -23984, -13575, -11996, -4505, -14650, -26663, -22039, -5759, 1007, -14351, 10179, -6246, -947, -17602, -20128, + 20315, 10333, -22875, -17167, 9215, -14718, 9855, -29394, -3114, 27730, -24266, 5990, 6817, 22790, -2722, 14360, + -14744, 23549, -15768, -18506, 4983, 21168, 12611, 3524, 15307, 2858, -21860, 29453, -28839, 27858, -27201, 3396, + 18242, 5452, 32252, -18745, 1724, -4573, 21868, 31518, -19394, 20964, -8908, -18541, 23242, 17491, 13933, 16885, + 10350, -32004, 17440, -24214, -21399, -20435, -11962, -22764, -10972, -27986, 30136, -802, -29667, 11885, -1689, -13686, + -18191, 32695, 6032, -16006, -21766, -20759, 30426, -24931, -31177, -32124, 15810, -4317, 15998, 26330, 3456, -13711, + -4496, -19215, -9360, 26783, 23668, -14138, 27132, -32414, -25597, -2807, -5529, 8831, -5913, 17636, 1869, -16638, + -11655, 9513, 22124, 25648, -24581, -21723, 21953, -14129, 22568, -15111, 23225, 26621, -20674, -15221, 17030, -1715, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static const vec1488 qdata_10753 = { .data = { + + 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, + + 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, + + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + + 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, + + 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, + + 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, + + -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, + + 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, + + -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, + + 1018, 1018, 1018, 1018, 2413, 2413, 2413, 2413, 4188, 4188, 4188, 4188, 357, 357, 357, 357, + 223, 223, 223, 223, -3686, -3686, -3686, -3686, -3688, -3688, -3688, -3688, -376, -376, -376, -376, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 10093, 10093, 10093, 10093, -1956, -1956, -1956, -1956, 28517, 28517, 28517, 28517, + 27359, 27359, 27359, 27359, -21094, -21094, -21094, -21094, 408, 408, 408, 408, -20856, -20856, -20856, -20856, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 1018, 1018, 1018, -2695, -2695, -2695, -2695, 2413, 2413, 2413, 2413, 425, 425, 425, 425, + 4188, 4188, 4188, 4188, -4855, -4855, -4855, -4855, 357, 357, 357, 357, -3364, -3364, -3364, -3364, + 223, 223, 223, 223, 730, 730, 730, 730, -3686, -3686, -3686, -3686, -4544, -4544, -4544, -4544, + -3688, -3688, -3688, -3688, -2236, -2236, -2236, -2236, -376, -376, -376, -376, 3784, 3784, 3784, 3784, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 7033, 7033, 7033, 7033, 10093, 10093, 10093, 10093, 18345, 18345, 18345, 18345, + -1956, -1956, -1956, -1956, 29449, 29449, 29449, 29449, 28517, 28517, 28517, 28517, -9508, -9508, -9508, -9508, + 27359, 27359, 27359, 
27359, 16090, 16090, 16090, 16090, -21094, -21094, -21094, -21094, 28224, 28224, 28224, 28224, + 408, 408, 408, 408, -12476, -12476, -12476, -12476, -20856, -20856, -20856, -20856, 16072, 16072, 16072, 16072, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -1520, -2695, 1341, 2413, 918, 425, 5175, 4188, -4035, -4855, 341, 357, 4347, -3364, 5213, + 223, -4875, 730, 1931, -3686, -2503, -4544, -4095, -3688, 5063, -2236, -3823, -376, 3012, 3784, -2629, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 23056, 7033, 829, 10093, 26518, 18345, 3639, -1956, -4547, 29449, 3925, 28517, -7429, -9508, -11683, + 27359, -17675, 16090, 14731, -21094, -25543, 28224, -14847, 408, 28103, -12476, 10001, -20856, -7228, 16072, 18363, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -2935, -1520, -4744, -2695, -205, 1341, 1299, 2413, 4, 918, -4379, 425, -4616, 5175, -544, + 4188, 4129, -4035, 4102, -4855, -1287, 341, -2388, 357, 1284, 4347, 2984, -3364, 2178, 5213, -2576, + 223, 2790, -4875, 4876, 730, -4513, 1931, -3085, -3686, 3550, -2503, 847, -4544, 193, -4095, 1085, + -3688, 3091, 5063, -4742, -2236, 2982, -3823, -1009, -376, -268, 3012, 3062, 3784, -2565, -2629, 4189, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 31369, 23056, 15736, 7033, -24269, 829, -6381, 10093, 22532, 26518, 23781, 18345, 15864, 3639, 15840, + -1956, -23007, -4547, 5126, 29449, 8441, 3925, -16724, 28517, 23812, -7429, 31656, -9508, -19326, -11683, -27152, + 27359, 20198, -17675, 6924, 16090, 22623, 14731, 5619, -21094, -24098, -25543, 3407, 28224, 22209, -14847, 573, + 408, -4589, 28103, -5766, -12476, -12378, 10001, -31217, -20856, -2316, -7228, -20490, 16072, -14341, 18363, -12707, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -4734, -2935, -400, -1520, 4977, -4744, -2973, -2695, 512, -205, -779, 1341, -1356, 1299, 635, + 2413, 567, 4, -4286, 918, -5114, -4379, -1586, 425, 1615, -4616, -336, 5175, -1841, -544, 2234, + 4188, -3441, 4129, 636, -4035, -4580, 4102, 2684, -4855, 3057, -1287, -2740, 341, -5156, -2388, -472, + 357, -794, 1284, 578, 4347, 3615, 2984, -3715, -3364, 2271, 2178, -326, 5213, 454, -2576, -3337, + 223, 2998, 2790, -151, -4875, 2981, 4876, 1324, 730, 2774, -4513, 2206, 1931, 886, -3085, -970, + -3686, 3198, 3550, 2737, -2503, -909, 847, 1068, -4544, -2213, 193, 2884, -4095, -4808, 1085, 4123, + -3688, 5341, 3091, 5294, 5063, -116, -4742, -5116, -2236, -2045, 2982, -1572, -3823, 4828, -1009, 467, + -376, 5023, -268, -3169, 3012, -1458, 3062, -1268, 3784, -675, -2565, 1006, -2629, 5064, 4189, 864, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -26238, 31369, -24976, 23056, -30351, 15736, -18845, 7033, 512, -24269, -13579, 829, 29364, -6381, -11141, + 10093, -969, 22532, 6978, 26518, -4090, 23781, 11726, 18345, 4175, 15864, 7856, 3639, 719, 15840, -31558, + -1956, 31887, -23007, -21892, -4547, 22044, 5126, -19844, 29449, -32271, 8441, 32076, 3925, -11300, -16724, 28200, + 28517, 16614, 23812, 11842, -7429, -2017, 31656, 28541, -9508, 29407, -19326, 31418, -11683, -31290, -27152, 27895, + 27359, 12214, 20198, -14999, -17675, -1627, 6924, -13012, 16090, -4394, 22623, 7326, 14731, -22666, 5619, 8246, + -21094, 24702, -24098, 177, -25543, 7795, 3407, -13268, 28224, 2395, 22209, -7356, -14847, -17096, 573, -24037, + 408, -11555, -4589, -30546, 28103, 1932, -5766, 17412, -12476, 31235, -12378, -7716, 10001, -1316, -31217, 25555, + -20856, -609, -2316, -8801, -7228, 11854, -20490, 780, 16072, -17571, -14341, -2066, 18363, 17352, -12707, 17248, + 
6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 3453, -4734, 4519, -2935, 2118, -400, -554, -1520, 2196, 4977, 1893, -4744, -1409, -2973, -1053, + -2695, 4601, 512, 279, -205, -3241, -779, 4889, 1341, 3524, -1356, -1663, 1299, 2283, 635, 73, + 2413, 2428, 567, 624, 4, -1930, -4286, 3419, 918, -2062, -5114, 5068, -4379, -97, -1586, 1782, + 425, 4621, 1615, 355, -4616, 1349, -336, 825, 5175, 3135, -1841, 1160, -544, 4408, 2234, -2605, + 4188, 854, -3441, -1056, 4129, 2439, 636, 4967, -4035, -4782, -4580, -5268, 4102, -663, 2684, -4670, + -4855, 3760, 3057, 3535, -1287, 2680, -2740, -569, 341, 2139, -5156, 3827, -2388, 1639, -472, 1927, + 357, 5172, -794, -4003, 1284, 4144, 578, 693, 4347, 4784, 3615, 3125, 2984, 1122, -3715, 2113, + -3364, -573, 2271, -4328, 2178, 2909, -326, -4000, 5213, -4447, 454, -3995, -2576, -4428, -3337, 2529, + 223, 5309, 2998, 5120, 2790, -2050, -151, 2963, -4875, 2657, 2981, -2807, 4876, 2237, 1324, -4403, + 730, 2624, 2774, -5083, -4513, 40, 2206, 152, 1931, -1573, 886, 2625, -3085, -778, -970, -5107, + -3686, 4250, 3198, -5356, 3550, -3148, 2737, -3360, -2503, -2015, -909, 3096, 847, 5313, 1068, 834, + -4544, -1132, -2213, -2151, 193, -1722, 2884, -4393, -4095, 2662, -4808, -2788, 1085, -1992, 4123, 5334, + -3688, 5215, 5341, -1689, 3091, -2117, 5294, 4859, 5063, 3410, -116, 2205, -4742, -2374, -5116, -4720, + -2236, 3570, -2045, 2813, 2982, 2087, -1572, -4973, -3823, 458, 4828, 3891, -1009, -2419, 467, -4891, + -376, -1381, 5023, 1204, -268, 274, -3169, -3260, 3012, -1635, -1458, 4540, 3062, -4254, -1268, -1111, + 3784, 2230, -675, -2279, -2565, -4359, 1006, -1510, -2629, 5015, 5064, -2449, 4189, -5005, 864, 2487, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -29827, -26238, -21593, 31369, -29626, -24976, -7722, 23056, -16236, -30351, 30053, 15736, 9343, -18845, -16925, + 7033, 14329, 512, 15127, -24269, -21161, -13579, -1767, 829, -6716, 29364, -12415, -6381, 31467, -11141, 1609, + 10093, -20100, -969, -23952, 22532, -25482, 6978, 8027, 26518, 17394, -4090, -25652, 23781, -5729, 11726, -21770, + 18345, -4083, 4175, -15517, 15864, -19643, 7856, -22215, 3639, -18881, 719, -19320, 15840, -7880, -31558, 22483, + -1956, -6314, 31887, 15328, -23007, -7289, -21892, 11623, -4547, 31058, 22044, 13164, 5126, -15511, -19844, 6594, + 29449, 11952, -32271, 6095, 8441, 23160, 32076, 22471, 3925, 6747, -11300, 12531, -16724, 8295, 28200, -7801, + 28517, -29644, 16614, -20899, 23812, 12336, 11842, 20661, -7429, 12976, -2017, 23093, 31656, -3998, 28541, 24129, + -9508, -61, 29407, -232, -19326, -13987, 31418, 12384, -11683, -31583, -31290, 24165, -27152, 26292, 27895, 8161, + 27359, 4797, 12214, 5120, 20198, 19454, -14999, -4717, -17675, 8289, -1627, 31497, 6924, 1725, -13012, 19661, + 16090, -30144, -4394, -9691, 22623, 28712, 7326, 4248, 14731, 3035, -22666, 24641, 5619, -24330, 8246, -13811, + -21094, -13158, 24702, -23788, -24098, 27572, 177, 13024, -25543, -29151, 7795, 7192, 3407, 27329, -13268, 12098, + 28224, -19564, 2395, -8807, 22209, 32070, -7356, -22313, -14847, 20070, -17096, 23836, 573, -14280, -24037, -1834, + 408, 32351, -11555, 4967, -4589, 18875, -30546, -6917, 28103, -26286, 1932, 18077, -5766, 29370, 17412, 19856, + -12476, 23026, 31235, -30467, -12378, -24025, -7716, -12653, 10001, -8758, -1316, -20173, -31217, -11123, 25555, 23269, + -20856, -29541, -609, 31924, -2316, 3346, -8801, -13500, -7228, 14237, 11854, 14780, -20490, -9374, 780, 16809, + 16072, 11446, -17571, -8935, -14341, 5369, -2066, -18918, 18363, 19863, 17352, -16273, 
-12707, 3699, 17248, 951, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static inline __m256i sub_x16(__m256i a, __m256i b) { + //__asm__("vpsubw %1,%0,%0" : "+x"(a),"+x"(b)); + return _mm256_sub_epi16(a, b); +} + +static inline __m256i add_x16(__m256i a, __m256i b) { + return _mm256_add_epi16(a, b); +} + +static inline __m256i reduce_x16(const __m256i *qdata, __m256i x) { + __m256i y = _mm256_mulhi_epi16(x, qrecip_x16); + y = _mm256_mulhrs_epi16(y, qshift_x16); + y = _mm256_mullo_epi16(y, q_x16); + return sub_x16(x, y); +} + +static inline __m256i mulmod_x16_scaled(const __m256i *qdata, __m256i x, __m256i y, __m256i yqinv) { + __m256i b = _mm256_mulhi_epi16(x, y); + __m256i d = _mm256_mullo_epi16(x, yqinv); + __m256i e = _mm256_mulhi_epi16(d, q_x16); + return sub_x16(b, e); +} + +typedef union { + int8 data[32]; + __m256i _dummy; +} byte32; +static const byte32 shuffle_buf = { .data = { + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + } +}; +#define shuffle (*(__m256i *) shuffle_buf.data) + +static inline __m256i _mm256_loadu_reverse16(const __m256i *p) { + __m256i x = _mm256_loadu_si256(p); + x = _mm256_permute2x128_si256(x, x, 1); + x = _mm256_shuffle_epi8(x, shuffle); + return x; +} + +static void ntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + for (rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 = _mm256_loadu_si256((__m256i *) (f + 96)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_0, zetainv_qinv_128_0); + f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 0), zeta_qinv(128, 0)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 0), zeta_qinv(64, 0)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = 
_mm256_loadu_si256((__m256i *) (f + 80)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_1, zetainv_qinv_128_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 1), zeta_qinv(128, 1)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 1), zeta_qinv(64, 1)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 0), zeta_x4_qinv(16, 0)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 32)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 1), zeta_x4_qinv(16, 1)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f 
+ 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); + _mm256_storeu_si256((__m256i *) (f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + + f1 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 32)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f2 = add_x16(g2, g3); + f3 = sub_x16(g2, g3); + f2 = reduce_x16(qdata, f2); + f3 = reduce_x16(qdata, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f0 = reduce_x16(qdata, f0); + + h0 = f0; + h1 = f1; + h2 = f2; + h3 = f3; + + f1 = _mm256_loadu_si256((__m256i *) (f + 80)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv8_x16, zetainv8_x16_qinv); + f2 = mulmod_x16_scaled(qdata, f2, zeta8_x16, zeta8_x16_qinv); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta4_x16, zeta4_x16_qinv); + f0 = reduce_x16(qdata, f0); + + g0 = add_x16(h0, f0); + g1 = add_x16(h1, f1); + g2 = add_x16(h2, f2); + g3 = add_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 0), g0); + _mm256_storeu_si256((__m256i *) (f + 16), g1); + _mm256_storeu_si256((__m256i *) (f + 32), g2); + _mm256_storeu_si256((__m256i *) (f + 48), g3); + g0 = sub_x16(h0, f0); + g1 = sub_x16(h1, f1); + g2 = sub_x16(h2, f2); + g3 = sub_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 64), g0); + _mm256_storeu_si256((__m256i *) (f + 80), g1); + _mm256_storeu_si256((__m256i *) (f + 96), g2); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + f += 128; + } +} + +static void ntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + int16 *origf = f; + int rep; + __m256i zetainv_512[8]; + __m256i zetainv_qinv_512[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_512[i], zetainv_qinv_512[i]); + f2 = mulmod_x16_scaled(qdata, f2, zeta(512, i), zeta_qinv(512, i)); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(256, i), zeta_qinv(256, i)); + f0 = reduce_x16(qdata, f0); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i), f0); + 
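+ /* This slice's radix-4 butterflies (with the zeta_512/zeta_256 twists above) are done.
+    Once all 8 slices of a 512-coefficient block have been processed, its four contiguous
+    128-coefficient quarters hold independent length-128 subproblems, which the
+    ntt128(f, reps * 4, qdata) call after this loop completes. */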
+ } + f += 512; + } + f = origf; + ntt128(f, reps * 4, qdata); +} + +void PQCLEAN_NTRULPR857_AVX2_ntt512_7681(int16 *f, int reps) { + ntt512(f, reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_NTRULPR857_AVX2_ntt512_10753(int16 *f, int reps) { + ntt512(f, reps, (const __m256i *) qdata_10753.data); +} + +static void invntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_x4_16_0 = zetainv_x4(16, 0); + __m256i zetainv_x4_qinv_16_0 = zetainv_x4_qinv(16, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_64_0 = zetainv(64, 0); + __m256i zetainv_qinv_64_0 = zetainv_qinv(64, 0); + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i zetainv_x4_16_1 = zetainv_x4(16, 1); + __m256i zetainv_x4_qinv_16_1 = zetainv_x4_qinv(16, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + __m256i zetainv_64_1 = zetainv(64, 1); + __m256i zetainv_qinv_64_1 = zetainv_qinv(64, 1); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g0 = _mm256_loadu_si256((__m256i *) (f + 32)); + g1 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + h1 = sub_x16(f0, f1); + h1 = reduce_x16(qdata, h1); + h0 = add_x16(f0, f1); + h3 = sub_x16(f2, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h2 = add_x16(f2, f3); + f1 = sub_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zetainv8_x16, zetainv8_x16_qinv); + f0 = add_x16(g0, g1); + f3 = sub_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zeta8_x16, zeta8_x16_qinv); + f2 = add_x16(g2, g3); + + g0 = add_x16(h0, h2); + g0 = reduce_x16(qdata, g0); + g2 = sub_x16(h0, h2); + g2 = reduce_x16(qdata, g2); + g1 = sub_x16(h1, h3); + g3 = add_x16(h1, h3); + h2 = sub_x16(f0, f2); + h2 = mulmod_x16_scaled(qdata, h2, zeta4_x16, zeta4_x16_qinv); + h0 = add_x16(f0, f2); + h3 = add_x16(f1, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h1 = sub_x16(f1, f3); + + f0 = add_x16(g0, h0); + g0 = sub_x16(g0, h0); + f1 = add_x16(g1, h1); + g1 = sub_x16(g1, h1); + f2 = sub_x16(g2, h2); + g2 = add_x16(g2, h2); + f3 = sub_x16(g3, h3); + g3 = add_x16(g3, h3); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), g0); + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 96), g1); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + _mm256_storeu_si256((__m256i *) (f + 48), g2); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = _mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 
= _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_0, zetainv_x4_qinv_16_0); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 32)); + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = _mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 = _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_1, zetainv_x4_qinv_16_1); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); + _mm256_storeu_si256((__m256i *) (f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 = _mm256_loadu_si256((__m256i *) (f + 96)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = _mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_0, zetainv_qinv_128_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 0), zeta_qinv(128, 0)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_0, zetainv_qinv_64_0); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + 
_mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = _mm256_loadu_si256((__m256i *) (f + 80)); + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = _mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_1, zetainv_qinv_128_1); + f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 1), zeta_qinv(128, 1)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_1, zetainv_qinv_64_1); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + + f += 128; + } +} + +static void invntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + /* [-Werror=unused-variable] */ /* int16 *origf = f; */ + int rep; + __m256i zetainv_512[8]; + __m256i zetainv_qinv_512[8]; + __m256i zetainv_256[8]; + __m256i zetainv_qinv_256[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_256[i] = zetainv(256, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_256[i] = zetainv_qinv(256, i); + } + invntt128(f, 4 * reps, qdata); + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_512[i], zetainv_qinv_512[i]); + f3 = mulmod_x16_scaled(qdata, f3, zeta(512, i), zeta_qinv(512, i)); + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_256[i], zetainv_qinv_256[i]); + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + } + f += 512; + } +} + +void 
PQCLEAN_NTRULPR857_AVX2_invntt512_7681(int16 *f, int reps) { + invntt512(f, reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_NTRULPR857_AVX2_invntt512_10753(int16 *f, int reps) { + invntt512(f, reps, (const __m256i *) qdata_10753.data); +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857_ntt.h b/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857_ntt.h new file mode 100644 index 00000000..caaf40e3 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_core_multsntrup857_ntt.h @@ -0,0 +1,13 @@ +#ifndef ntt_H +#define ntt_H + +#include + + + +extern void PQCLEAN_NTRULPR857_AVX2_ntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_NTRULPR857_AVX2_ntt512_10753(int16_t *f, int reps); +extern void PQCLEAN_NTRULPR857_AVX2_invntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_NTRULPR857_AVX2_invntt512_10753(int16_t *f, int reps); + +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_256x16.c b/crypto_kem/ntrulpr857/avx2/crypto_decode_256x16.c new file mode 100644 index 00000000..2a81fee1 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_256x16.c @@ -0,0 +1,11 @@ +#include "crypto_decode_256x16.h" + + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x16(void *v, const unsigned char *s) { + unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + T[2 * i] = s[i] & 15; + T[2 * i + 1] = s[i] >> 4; + } +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_256x16.h b/crypto_kem/ntrulpr857/avx2/crypto_decode_256x16.h new file mode 100644 index 00000000..30083556 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_256X16_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_256X16_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_256x2.c b/crypto_kem/ntrulpr857/avx2/crypto_decode_256x2.c new file mode 100644 index 00000000..58051a54 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_256x2.c @@ -0,0 +1,27 @@ +#include "crypto_decode_256x2.h" +#include +#include + +#define COPY _mm256_set_epi64x(0x0303030303030303,0x0202020202020202,0x0101010101010101,0x0000000000000000) +#define MASK _mm256_set1_epi64x(0x8040201008040201) +#define MASK2 _mm256_set1_epi64x(0x0101010101010101) + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x2(void *v, const unsigned char *s) { + __m256i *r = v; + int i; + + for (i = 0; i < 8; ++i) { + /* bytes s0 s1 s2 s3 */ + __m256i x = _mm256_set1_epi32(*(int32_t *) s); + /* s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 */ + x = _mm256_shuffle_epi8(x, COPY); + /* s0 s0 s0 s0 s0 s0 s0 s0 s1 s1 s1 s1 s1 s1 s1 s1 s2 s2 s2 s2 s2 s2 s2 s2 s3 s3 s3 s3 s3 s3 s3 s3 */ + x = _mm256_andnot_si256(x, MASK); + x = _mm256_cmpeq_epi8(x, _mm256_setzero_si256()); + x &= MASK2; + _mm256_storeu_si256(r, x); + + s += 4; + r += 1; + } +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_256x2.h b/crypto_kem/ntrulpr857/avx2/crypto_decode_256x2.h new file mode 100644 index 00000000..1977de43 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_256X2_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_256X2_H + +#include 
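The crypto_decode_256x2 routine above expands 32 input bytes into 256 output bytes holding one bit each: the 4 loaded bytes are broadcast, the COPY shuffle replicates each byte across 8 lanes, and the andnot/cmpeq against the 0x8040201008040201 mask selects bit j of byte k for output index 8*k + j (least significant bit first). A plain scalar version with the same input/output behaviour, not part of the patch:

#include <stddef.h>

static void decode_256x2_scalar(unsigned char *r, const unsigned char *s) {
    /* r[8*k + j] = bit j of s[k]; 32 input bytes -> 256 output bytes */
    for (size_t i = 0; i < 256; ++i) {
        r[i] = (unsigned char) ((s[i >> 3] >> (i & 7)) & 1);
    }
}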
+#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x2(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_857x1723.c b/crypto_kem/ntrulpr857/avx2/crypto_decode_857x1723.c new file mode 100644 index 00000000..ff768213 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_857x1723.c @@ -0,0 +1,430 @@ +#include "crypto_decode_857x1723.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define int32 int32_t + +static inline int16 mullo(int16 x, int16 y) { + return x * y; +} + +static inline int16 mulhi(int16 x, int16 y) { + return (x * (int32)y) >> 16; +} + +static inline __m256i add(__m256i x, __m256i y) { + return _mm256_add_epi16(x, y); +} + +static inline __m256i sub(__m256i x, __m256i y) { + return _mm256_sub_epi16(x, y); +} + +static inline __m256i shiftleftconst(__m256i x, int16 y) { + return _mm256_slli_epi16(x, y); +} + +static inline __m256i signedshiftrightconst(__m256i x, int16 y) { + return _mm256_srai_epi16(x, y); +} + +static inline __m256i addconst(__m256i x, int16 y) { + return add(x, _mm256_set1_epi16(y)); +} + +static inline __m256i subconst(__m256i x, int16 y) { + return sub(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulloconst(__m256i x, int16 y) { + return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulhiconst(__m256i x, int16 y) { + return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i ifgesubconst(__m256i x, int16 y) { + __m256i y16 = _mm256_set1_epi16(y); + __m256i top16 = _mm256_set1_epi16((int16)(y - 1)); + return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); +} + +static inline __m256i ifnegaddconst(__m256i x, int16 y) { + return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); +} + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x1723(void *v, const unsigned char *s) { + int16 *R0 = v; + int16 R1[429], R2[215], R3[108], R4[54], R5[27], R6[14], R7[7], R8[4], R9[2], R10[1]; + long long i; + int16 a0, a1, a2; + __m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; + + s += PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x1723_STRBYTES; + a1 = 0; + a1 += *--s; /* 0...255 */ + a1 -= 160; /* -160...95 */ + a1 += (a1 >> 15) & 160; /* 0...159 */ + R10[0] = a1; + + /* R10 ------> R9: reconstruct mod 1*[743]+[14044] */ + + i = 0; + s -= 2; + a0 = R10[0]; + a0 = mulhi(a0, 276) - mulhi(mullo(a0, -22580), 743); /* -372...440 */ + a0 += s[2 * i + 1]; /* -372...695 */ + a0 = mulhi(a0, 276) - mulhi(mullo(a0, -22580), 743); /* -374...374 */ + a0 += s[2 * i + 0]; /* -374...629 */ + a0 += (a0 >> 15) & 743; /* 0...742 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, -3881); + + /* invalid inputs might need reduction mod 14044 */ + a1 -= 14044; + a1 += (a1 >> 15) & 14044; + + R9[0] = a0; + R9[1] = a1; + s -= 0; + + /* R9 ------> R8: reconstruct mod 3*[436]+[8246] */ + + i = 0; + s -= 1; + a2 = a0 = R9[1]; + a0 = mulhi(a0, -64) - mulhi(mullo(a0, 27056), 436); /* -234...218 */ + a0 += s[1 * i + 0]; /* -234...473 */ + a0 -= 436; /* -670..>37 */ + a0 += (a0 >> 15) & 436; /* -234...435 */ + a0 += (a0 >> 15) & 436; /* 0...435 */ + a1 = (a2 << 6) + ((s[i] - a0) >> 2); + a1 = mullo(a1, 2405); + + /* invalid inputs might need reduction mod 8246 */ + a1 -= 8246; + a1 += (a1 >> 15) & 8246; + + R8[2] = a0; + R8[3] = a1; + s -= 1; + for (i = 0; i >= 0; --i) { + a2 = 
a0 = R9[i]; + a0 = mulhi(a0, -64) - mulhi(mullo(a0, 27056), 436); /* -234...218 */ + a0 += s[1 * i + 0]; /* -234...473 */ + a0 -= 436; /* -670..>37 */ + a0 += (a0 >> 15) & 436; /* -234...435 */ + a0 += (a0 >> 15) & 436; /* 0...435 */ + a1 = (a2 << 6) + ((s[i] - a0) >> 2); + a1 = mullo(a1, 2405); + + /* invalid inputs might need reduction mod 436 */ + a1 -= 436; + a1 += (a1 >> 15) & 436; + + R8[2 * i] = a0; + R8[2 * i + 1] = a1; + } + + /* R8 ------> R7: reconstruct mod 6*[334]+[8246] */ + + R7[6] = R8[3]; + s -= 3; + for (i = 2; i >= 0; --i) { + a2 = a0 = R8[i]; + a0 = mulhi(a0, 62) - mulhi(mullo(a0, 15305), 334); /* -167...182 */ + a0 += s[1 * i + 0]; /* -167...437 */ + a0 -= 334; /* -501..>103 */ + a0 += (a0 >> 15) & 334; /* -167...333 */ + a0 += (a0 >> 15) & 334; /* 0...333 */ + a1 = (a2 << 7) + ((s[i] - a0) >> 1); + a1 = mullo(a1, -22761); + + /* invalid inputs might need reduction mod 334 */ + a1 -= 334; + a1 += (a1 >> 15) & 334; + + R7[2 * i] = a0; + R7[2 * i + 1] = a1; + } + + /* R7 ------> R6: reconstruct mod 13*[292]+[7229] */ + + i = 0; + s -= 1; + a2 = a0 = R7[6]; + a0 = mulhi(a0, 64) - mulhi(mullo(a0, 8080), 292); /* -146...162 */ + a0 += s[1 * i + 0]; /* -146...417 */ + a0 -= 292; /* -438..>125 */ + a0 += (a0 >> 15) & 292; /* -146...291 */ + a0 += (a0 >> 15) & 292; /* 0...291 */ + a1 = (a2 << 6) + ((s[i] - a0) >> 2); + a1 = mullo(a1, -3591); + + /* invalid inputs might need reduction mod 7229 */ + a1 -= 7229; + a1 += (a1 >> 15) & 7229; + + R6[12] = a0; + R6[13] = a1; + s -= 6; + for (i = 5; i >= 0; --i) { + a2 = a0 = R7[i]; + a0 = mulhi(a0, 64) - mulhi(mullo(a0, 8080), 292); /* -146...162 */ + a0 += s[1 * i + 0]; /* -146...417 */ + a0 -= 292; /* -438..>125 */ + a0 += (a0 >> 15) & 292; /* -146...291 */ + a0 += (a0 >> 15) & 292; /* 0...291 */ + a1 = (a2 << 6) + ((s[i] - a0) >> 2); + a1 = mullo(a1, -3591); + + /* invalid inputs might need reduction mod 292 */ + a1 -= 292; + a1 += (a1 >> 15) & 292; + + R6[2 * i] = a0; + R6[2 * i + 1] = a1; + } + + /* R6 ------> R5: reconstruct mod 26*[273]+[7229] */ + + R5[26] = R6[13]; + s -= 13; + for (i = 12; i >= 0; --i) { + a2 = a0 = R6[i]; + a0 = mulhi(a0, 1) - mulhi(mullo(a0, 4081), 273); /* -137...136 */ + a0 += s[1 * i + 0]; /* -137...391 */ + a0 -= 273; /* -410..>118 */ + a0 += (a0 >> 15) & 273; /* -137...272 */ + a0 += (a0 >> 15) & 273; /* 0...272 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, 4081); + + /* invalid inputs might need reduction mod 273 */ + a1 -= 273; + a1 += (a1 >> 15) & 273; + + R5[2 * i] = a0; + R5[2 * i + 1] = a1; + } + + /* R5 ------> R4: reconstruct mod 53*[4225]+[438] */ + + i = 0; + s -= 1; + a2 = a0 = R5[26]; + a0 = mulhi(a0, -259) - mulhi(mullo(a0, -3971), 4225); /* -2178...2112 */ + a0 += s[1 * i + 0]; /* -2178...2367 */ + a0 += (a0 >> 15) & 4225; /* 0...4224 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, 12161); + + /* invalid inputs might need reduction mod 438 */ + a1 -= 438; + a1 += (a1 >> 15) & 438; + + R4[52] = a0; + R4[53] = a1; + s -= 52; + i = 10; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R5[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, -259), mulhiconst(mulloconst(A0, -3971), 4225)); /* -2178...2112 */ + A0 = add(A0, S1); /* -2178...2367 */ + A0 = sub(mulhiconst(A0, -259), mulhiconst(mulloconst(A0, -3971), 4225)); /* -2122...2121 */ + A0 = add(A0, S0); /* -2122...2376 */ + A0 = ifnegaddconst(A0, 4225); /* 0...4224 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, 
12161); + + /* invalid inputs might need reduction mod 4225 */ + A1 = ifgesubconst(A1, 4225); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R4 ------> R3: reconstruct mod 107*[65]+[1723] */ + + i = 0; + s -= 1; + a2 = a0 = R4[53]; + a0 = mulhi(a0, 1) - mulhi(mullo(a0, 4033), 65); /* -33...32 */ + a0 += s[1 * i + 0]; /* -33...287 */ + a0 = mulhi(a0, 16) - mulhi(mullo(a0, -1008), 65); /* -33...32 */ + a0 += (a0 >> 15) & 65; /* 0...64 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, 4033); + + /* invalid inputs might need reduction mod 1723 */ + a1 -= 1723; + a1 += (a1 >> 15) & 1723; + + R3[106] = a0; + R3[107] = a1; + s -= 0; + i = 37; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R4[i]); + A0 = sub(mulhiconst(A0, 16), mulhiconst(mulloconst(A0, -1008), 65)); /* -33...36 */ + A0 = ifnegaddconst(A0, 65); /* 0...64 */ + A1 = signedshiftrightconst(sub(A2, A0), 0); + A1 = mulloconst(A1, 4033); + + /* invalid inputs might need reduction mod 65 */ + A1 = ifgesubconst(A1, 65); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R3 ------> R2: reconstruct mod 214*[2053]+[1723] */ + + R2[214] = R3[107]; + s -= 214; + i = 91; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R3[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 100), mulhiconst(mulloconst(A0, -8172), 2053)); /* -1027...1051 */ + A0 = add(A0, S1); /* -1027...1306 */ + A0 = sub(mulhiconst(A0, 100), mulhiconst(mulloconst(A0, -8172), 2053)); /* -1029...1028 */ + A0 = add(A0, S0); /* -1029...1283 */ + A0 = ifnegaddconst(A0, 2053); /* 0...2052 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, -31539); + + /* invalid inputs might need reduction mod 2053 */ + A1 = ifgesubconst(A1, 2053); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: 
r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R2 ------> R1: reconstruct mod 428*[11597]+[1723] */ + + R1[428] = R2[214]; + s -= 428; + i = 198; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R2[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, -3643), mulhiconst(mulloconst(A0, -1447), 11597)); /* -6710...5798 */ + A0 = add(A0, S1); /* -6710...6053 */ + A0 = sub(mulhiconst(A0, -3643), mulhiconst(mulloconst(A0, -1447), 11597)); /* -6135...6171 */ + A0 = add(A0, S0); /* -6135...6426 */ + A0 = ifnegaddconst(A0, 11597); /* 0...11596 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, -11387); + + /* invalid inputs might need reduction mod 11597 */ + A1 = ifgesubconst(A1, 11597); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R1 ------> R0: reconstruct mod 857*[1723] */ + + R0[856] = 3 * R1[428] - 2583; + s -= 428; + i = 412; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R1[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 365), mulhiconst(mulloconst(A0, -9737), 1723)); /* -862...952 */ + A0 = add(A0, S0); /* -862...1207 */ + A0 = ifnegaddconst(A0, 1723); /* 0...1722 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, 20083); + + /* invalid inputs might need reduction mod 1723 */ + A1 = ifgesubconst(A1, 1723); + + A0 = mulloconst(A0, 3); + A1 = mulloconst(A1, 3); + A0 = subconst(A0, 2583); + A1 = subconst(A1, 2583); + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_857x1723.h b/crypto_kem/ntrulpr857/avx2/crypto_decode_857x1723.h new file mode 100644 index 00000000..0e8cb2cf --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_857x1723.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_857X1723_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_857X1723_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x1723_STRBYTES 1152 +#define 
PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x1723_ITEMS 857 +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x1723_ITEMBYTES 2 + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x1723(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_857x3.c b/crypto_kem/ntrulpr857/avx2/crypto_decode_857x3.c new file mode 100644 index 00000000..a886f960 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_857x3.c @@ -0,0 +1,65 @@ +#include "crypto_decode_857x3.h" +#include +#define uint8 uint8_t + +#define p 857 +#define loops 7 +#define overshoot 10 + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x3(void *v, const unsigned char *s) { + uint8 *f = v; + int loop; + uint8 *nextf = f + 128 - 4 * overshoot; + const unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i s0 = _mm256_loadu_si256((const __m256i *) s); + s = nexts; + nexts += 32; + + __m256i s1 = _mm256_srli_epi16(s0 & _mm256_set1_epi8(-16), 4); + s0 &= _mm256_set1_epi8(15); + + __m256i a0 = _mm256_unpacklo_epi8(s0, s1); + /* 0 0>>4 1 1>>4 2 2>>4 3 3>>4 4 4>>4 5 5>>4 6 6>>4 7 7>>4 */ + /* 16 16>>4 ... */ + __m256i a1 = _mm256_unpackhi_epi8(s0, s1); + /* 8 8>>4 9 9>>4 10 10>>4 ... */ + /* 24 24>>4 ... */ + + __m256i a2 = _mm256_srli_epi16(a0 & _mm256_set1_epi8(12), 2); + __m256i a3 = _mm256_srli_epi16(a1 & _mm256_set1_epi8(12), 2); + a0 &= _mm256_set1_epi8(3); + a1 &= _mm256_set1_epi8(3); + + __m256i b0 = _mm256_unpacklo_epi8(a0, a2); + /* 0 0>>2 0>>4 0>>6 1 1>>2 1>>4 1>>6 */ + /* 2 2>>2 2>>4 2>>6 3 3>>2 3>>4 3>.6 */ + /* 16 16>>2 16>>4 16>>6 ... */ + __m256i b2 = _mm256_unpackhi_epi8(a0, a2); + /* 4 4>>2 ... */ + __m256i b1 = _mm256_unpacklo_epi8(a1, a3); + /* 8 8>>2 ... */ + __m256i b3 = _mm256_unpackhi_epi8(a1, a3); + /* 12 12>>2 ... 
*/ + + __m256i f0 = _mm256_permute2x128_si256(b0, b2, 0x20); + __m256i f2 = _mm256_permute2x128_si256(b0, b2, 0x31); + __m256i f1 = _mm256_permute2x128_si256(b1, b3, 0x20); + __m256i f3 = _mm256_permute2x128_si256(b1, b3, 0x31); + + f0 = _mm256_add_epi8(f0, _mm256_set1_epi8(-1)); + f1 = _mm256_add_epi8(f1, _mm256_set1_epi8(-1)); + f2 = _mm256_add_epi8(f2, _mm256_set1_epi8(-1)); + f3 = _mm256_add_epi8(f3, _mm256_set1_epi8(-1)); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + f = nextf; + nextf += 128; + } + + *f = ((uint8)(*s & 3)) - 1; +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_857x3.h b/crypto_kem/ntrulpr857/avx2/crypto_decode_857x3.h new file mode 100644 index 00000000..789c740e --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_857x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_857X3_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_857X3_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x3_STRBYTES 215 +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x3_ITEMS 857 +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x3_ITEMBYTES 1 + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x3(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint16.c b/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint16.c new file mode 100644 index 00000000..b44dd5b1 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint16.c @@ -0,0 +1,16 @@ +#include "crypto_decode_857xint16.h" + + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint16.h b/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint16.h new file mode 100644 index 00000000..7f602bb3 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_857XINT16_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_857XINT16_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint16_STRBYTES 1714 +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint16_ITEMS 857 + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint32.c b/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint32.c new file mode 100644 index 00000000..3fd7e3b9 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_857xint32.h" + + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint32.h b/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint32.h new file mode 100644 index 00000000..822eda0c --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_decode_857xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_857XINT32_H +#define 
PQCLEAN_NTRULPR857_AVX2_CRYPTO_DECODE_857XINT32_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint32_STRBYTES 3428 +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint32_ITEMBYTES 4 +#define PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint32_ITEMS 857 + +void PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint32(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_256x16.c b/crypto_kem/ntrulpr857/avx2/crypto_encode_256x16.c new file mode 100644 index 00000000..a60c27c6 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_256x16.c @@ -0,0 +1,10 @@ +#include "crypto_encode_256x16.h" + + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x16(unsigned char *s, const void *v) { + const unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + s[i] = T[2 * i] + (T[2 * i + 1] << 4); + } +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_256x16.h b/crypto_kem/ntrulpr857/avx2/crypto_encode_256x16.h new file mode 100644 index 00000000..e2aa335f --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_256X16_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_256X16_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_256x2.c b/crypto_kem/ntrulpr857/avx2/crypto_encode_256x2.c new file mode 100644 index 00000000..6220e2ab --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_256x2.c @@ -0,0 +1,88 @@ +#include "crypto_encode_256x2.h" +#include +#include + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x2(unsigned char *s, const void *v) { + __m256i a0 = _mm256_loadu_si256(0 + (__m256i *) v); + __m256i a1 = _mm256_loadu_si256(1 + (__m256i *) v); + __m256i a2 = _mm256_loadu_si256(2 + (__m256i *) v); + __m256i a3 = _mm256_loadu_si256(3 + (__m256i *) v); + __m256i a4 = _mm256_loadu_si256(4 + (__m256i *) v); + __m256i a5 = _mm256_loadu_si256(5 + (__m256i *) v); + __m256i a6 = _mm256_loadu_si256(6 + (__m256i *) v); + __m256i a7 = _mm256_loadu_si256(7 + (__m256i *) v); + __m256i bottom = _mm256_set1_epi8(1); + __m256i zero = _mm256_setzero_si256(); + __m256i b0 = _mm256_cmpgt_epi8(a0 & bottom, zero); + __m256i b1 = _mm256_cmpgt_epi8(a1 & bottom, zero); + __m256i b2 = _mm256_cmpgt_epi8(a2 & bottom, zero); + __m256i b3 = _mm256_cmpgt_epi8(a3 & bottom, zero); + __m256i b4 = _mm256_cmpgt_epi8(a4 & bottom, zero); + __m256i b5 = _mm256_cmpgt_epi8(a5 & bottom, zero); + __m256i b6 = _mm256_cmpgt_epi8(a6 & bottom, zero); + __m256i b7 = _mm256_cmpgt_epi8(a7 & bottom, zero); + int32_t c0 = _mm256_movemask_epi8(b0); + int32_t c1 = _mm256_movemask_epi8(b1); + int32_t c2 = _mm256_movemask_epi8(b2); + int32_t c3 = _mm256_movemask_epi8(b3); + int32_t c4 = _mm256_movemask_epi8(b4); + int32_t c5 = _mm256_movemask_epi8(b5); + int32_t c6 = _mm256_movemask_epi8(b6); + int32_t c7 = _mm256_movemask_epi8(b7); + *s++ = c0; + c0 >>= 8; + *s++ = c0; + c0 >>= 8; + *s++ = c0; + c0 >>= 8; + *s++ = c0; + *s++ = c1; + c1 >>= 8; + *s++ = c1; + c1 >>= 8; + *s++ = c1; + c1 >>= 8; + *s++ = c1; + *s++ = c2; + c2 >>= 8; + *s++ = c2; + c2 >>= 8; + *s++ = c2; + c2 >>= 8; + *s++ = c2; + *s++ = c3; + c3 >>= 8; + *s++ = c3; + c3 >>= 8; + *s++ = c3; + c3 >>= 8; + *s++ = c3; + *s++ = c4; + c4 >>= 8; + 
*s++ = c4; + c4 >>= 8; + *s++ = c4; + c4 >>= 8; + *s++ = c4; + *s++ = c5; + c5 >>= 8; + *s++ = c5; + c5 >>= 8; + *s++ = c5; + c5 >>= 8; + *s++ = c5; + *s++ = c6; + c6 >>= 8; + *s++ = c6; + c6 >>= 8; + *s++ = c6; + c6 >>= 8; + *s++ = c6; + *s++ = c7; + c7 >>= 8; + *s++ = c7; + c7 >>= 8; + *s++ = c7; + c7 >>= 8; + *s++ = c7; +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_256x2.h b/crypto_kem/ntrulpr857/avx2/crypto_encode_256x2.h new file mode 100644 index 00000000..a552cb18 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_256X2_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_256X2_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x2(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723.c b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723.c new file mode 100644 index 00000000..0fce803b --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723.c @@ -0,0 +1,283 @@ +#include "crypto_encode_857x1723.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x1723(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[429]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 54; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2583)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1723)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[428] = (((R0[856] + 2583) & 16383) * 10923) >> 15; + + reading = (uint16 *) R; + writing = R; + i = 14; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 20; + writing -= 10; + out -= 20; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(11597)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(11597)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = 
_mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[214] = R[428]; + + reading = (uint16 *) R; + writing = R; + i = 7; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 10; + writing -= 5; + out -= 10; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(2053)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(2053)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[107] = R[214]; + + for (i = 0; i < 53; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)65; + R[i] = r2; + } + r0 = R[106]; + r1 = R[107]; + r2 = r0 + r1 * (uint32)65; + *out++ = r2; + r2 >>= 8; + R[53] = r2; + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 12; + writing -= 6; + out -= 12; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(4225)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(4225)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + r0 = R[52]; + r1 = R[53]; + r2 = r0 + r1 * (uint32)4225; + *out++ = r2; + r2 >>= 8; + R[26] = r2; + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 6; + writing -= 3; + out -= 3; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(273)); + 
x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[13] = R[26]; + + for (i = 0; i < 7; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)292; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)334; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[3] = R[6]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)436; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)743; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723.h b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723.h new file mode 100644 index 00000000..82842aea --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_857X1723_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_857X1723_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x1723_STRBYTES 1152 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x1723_ITEMS 857 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x1723_ITEMBYTES 2 + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x1723(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723round.c b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723round.c new file mode 100644 index 00000000..48343d4f --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723round.c @@ -0,0 +1,285 @@ +#include "crypto_encode_857x1723round.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x1723round(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[429]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 54; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); + x = _mm256_add_epi16(x, _mm256_add_epi16(x, x)); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2583)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1723)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = 
_mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[428] = (((3 * ((10923 * R0[856] + 16384) >> 15) + 2583) & 16383) * 10923) >> 15; + + reading = (uint16 *) R; + writing = R; + i = 14; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 20; + writing -= 10; + out -= 20; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(11597)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(11597)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[214] = R[428]; + + reading = (uint16 *) R; + writing = R; + i = 7; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 10; + writing -= 5; + out -= 10; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(2053)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(2053)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[107] = R[214]; + + for (i = 0; i < 53; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)65; + R[i] = r2; + } + r0 = R[106]; + r1 = R[107]; + r2 = r0 + r1 * (uint32)65; + *out++ = r2; + r2 >>= 8; + R[53] = r2; + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 12; + writing -= 6; + out -= 12; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(4225)); + x2 = _mm256_mullo_epi32(x2, 
_mm256_set1_epi32(4225)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + r0 = R[52]; + r1 = R[53]; + r2 = r0 + r1 * (uint32)4225; + *out++ = r2; + r2 >>= 8; + R[26] = r2; + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 6; + writing -= 3; + out -= 3; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(273)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[13] = R[26]; + + for (i = 0; i < 7; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)292; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)334; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[3] = R[6]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)436; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)743; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723round.h b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723round.h new file mode 100644 index 00000000..c6bcb415 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x1723round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_857X1723ROUND_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_857X1723ROUND_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x1723round_STRBYTES 1152 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x1723round_ITEMS 857 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x1723round_ITEMBYTES 2 + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x1723round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_857x3.c b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x3.c new file mode 100644 index 00000000..c6b7d2dc --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x3.c @@ -0,0 +1,64 @@ +#include "crypto_encode_857x3.h" +#include +#define uint8 uint8_t + +#define p 857 +#define loops 7 +#define overshoot 10 + +static const union { + uint8 init[32]; + 
__m256i val; +} lobytes_buf = { .init = { + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + } +}; +#define lobytes (lobytes_buf.val) + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x3(unsigned char *s, const void *v) { + const uint8 *f = v; + int loop; + const uint8 *nextf = f + 128 - 4 * overshoot; + unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i f0 = _mm256_loadu_si256((const __m256i *) (f + 0)); + __m256i f1 = _mm256_loadu_si256((const __m256i *) (f + 32)); + __m256i f2 = _mm256_loadu_si256((const __m256i *) (f + 64)); + __m256i f3 = _mm256_loadu_si256((const __m256i *) (f + 96)); + f = nextf; + nextf += 128; + + __m256i a0 = _mm256_packus_epi16(f0 & lobytes, f1 & lobytes); + /* 0 2 4 6 8 10 12 14 32 34 36 38 40 42 44 46 */ + /* 16 18 20 22 24 26 28 30 48 50 52 54 56 58 60 62 */ + __m256i a1 = _mm256_packus_epi16(_mm256_srli_epi16(f0, 8), _mm256_srli_epi16(f1, 8)); + /* 1 3 ... */ + __m256i a2 = _mm256_packus_epi16(f2 & lobytes, f3 & lobytes); + __m256i a3 = _mm256_packus_epi16(_mm256_srli_epi16(f2, 8), _mm256_srli_epi16(f3, 8)); + + a0 = _mm256_add_epi8(a0, _mm256_slli_epi16(a1 & _mm256_set1_epi8(63), 2)); + a2 = _mm256_add_epi8(a2, _mm256_slli_epi16(a3 & _mm256_set1_epi8(63), 2)); + + __m256i b0 = _mm256_packus_epi16(a0 & lobytes, a2 & lobytes); + /* 0 4 8 12 32 36 40 44 64 68 72 76 96 100 104 108 */ + /* 16 20 24 28 48 52 56 60 80 84 88 92 112 116 120 124 */ + __m256i b2 = _mm256_packus_epi16(_mm256_srli_epi16(a0, 8), _mm256_srli_epi16(a2, 8)); + /* 2 6 ... */ + + b0 = _mm256_add_epi8(b0, _mm256_slli_epi16(b2 & _mm256_set1_epi8(15), 4)); + + b0 = _mm256_permutevar8x32_epi32(b0, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + + b0 = _mm256_add_epi8(b0, _mm256_set1_epi8(85)); + + _mm256_storeu_si256((__m256i *) s, b0); + s = nexts; + nexts += 32; + } + + *s++ = *f++ + 1; +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_857x3.h b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x3.h new file mode 100644 index 00000000..b6ba7814 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_857x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_857X3_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_857X3_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x3_STRBYTES 215 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x3_ITEMS 857 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x3_ITEMBYTES 1 + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_857xint16.c b/crypto_kem/ntrulpr857/avx2/crypto_encode_857xint16.c new file mode 100644 index 00000000..136c7990 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_857xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_857xint16.h" + + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_857xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_encode_857xint16.h b/crypto_kem/ntrulpr857/avx2/crypto_encode_857xint16.h new file mode 100644 index 00000000..fea51c59 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_encode_857xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_857XINT16_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_ENCODE_857XINT16_H + +#include +#define 
PQCLEAN_NTRULPR857_AVX2_crypto_encode_857xint16_STRBYTES 1714 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_857xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR857_AVX2_crypto_encode_857xint16_ITEMS 857 + +void PQCLEAN_NTRULPR857_AVX2_crypto_encode_857xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_sort_int32.c b/crypto_kem/ntrulpr857/avx2/crypto_sort_int32.c new file mode 100644 index 00000000..84544dc6 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_sort_int32.c @@ -0,0 +1,1210 @@ +#include "crypto_sort_int32.h" +#include +// Based on supercop-20200820/crypto_sort/int32/avx2 + + +#define int32 int32_t + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min((a),(b)); \ + (b) = int32x8_max((a),(b)); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = (int32)((int64_t) * b - (int64_t) * a); + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if ((long long) n < 8) { + while ((long long) n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* 
stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = 
_mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, 
x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 
x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = 
_mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); 
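+                    /* the four MINMAX calls above compare vectors loaded 4*q elements apart; the two groups that follow handle distances 2*q and q, so each pass through this inner loop performs three bitonic merging stages (same pattern as int32_threestages earlier in this file) */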
+ int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* 
AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_NTRULPR857_AVX2_crypto_sort_int32(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *) y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_NTRULPR857_AVX2_crypto_sort_int32(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + 
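
The non-power-of-two entry point above handles 8 < n <= 256 by copying the input into a power-of-two buffer padded with 0x7fffffff sentinels, sorting that buffer, and copying the prefix back. A minimal sketch of that padding idea follows; demo_sort_pow2 and sort_with_padding are illustrative names only, and the insertion sort used as a stand-in for the internal int32_sort_2power is not constant time.

#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Demo stand-in for the power-of-two sorter; NOT constant time. */
static void demo_sort_pow2(int32_t *a, size_t n) {
    for (size_t i = 1; i < n; ++i) {
        int32_t v = a[i];
        size_t j = i;
        while (j > 0 && a[j - 1] > v) {
            a[j] = a[j - 1];
            --j;
        }
        a[j] = v;
    }
}

/* Pad n inputs (8 < n <= 256) up to 2q slots with INT32_MAX so the
   padding sorts to the end, sort the power-of-two buffer, copy back. */
static void sort_with_padding(int32_t *x, size_t n) {
    int32_t buf[256];
    size_t q = 8, i;
    while (q < n - q) {
        q += q;                      /* q < n <= 2q */
    }
    for (i = 0; i < n; ++i) {
        buf[i] = x[i];
    }
    for (i = n; i < 2 * q; ++i) {
        buf[i] = INT32_MAX;          /* sentinel padding */
    }
    demo_sort_pow2(buf, 2 * q);
    for (i = 0; i < n; ++i) {
        x[i] = buf[i];
    }
}

int main(void) {
    int32_t v[11] = {5, -1, 9, 3, 3, 0, -7, 12, 8, 2, 1};
    sort_with_padding(v, 11);
    for (size_t i = 0; i < 11; ++i) {
        printf("%d ", v[i]);
    }
    printf("\n");                    /* prints: -7 -1 0 1 2 3 3 5 8 9 12 */
    return 0;
}
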
int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = 
_mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j 
+ 1]); + } +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_sort_int32.h b/crypto_kem/ntrulpr857/avx2/crypto_sort_int32.h new file mode 100644 index 00000000..84854cd5 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_SORT +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_SORT + +#include +#include + + +void PQCLEAN_NTRULPR857_AVX2_crypto_sort_int32(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_sort_uint32.c b/crypto_kem/ntrulpr857/avx2/crypto_sort_uint32.c new file mode 100644 index 00000000..9c88b709 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_NTRULPR857_AVX2_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_NTRULPR857_AVX2_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_sort_uint32.h b/crypto_kem/ntrulpr857/avx2/crypto_sort_uint32.h new file mode 100644 index 00000000..5fa43ab6 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_sort_uint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_SORT_UINT32_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_SORT_UINT32_H + +#include +#include + + +void PQCLEAN_NTRULPR857_AVX2_crypto_sort_uint32(uint32_t *array, size_t n); + +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_stream_aes256ctr.c b/crypto_kem/ntrulpr857/avx2/crypto_stream_aes256ctr.c new file mode 100644 index 00000000..1d3eeaa9 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_stream_aes256ctr.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" + + +int PQCLEAN_NTRULPR857_AVX2_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); + return 0; +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_stream_aes256ctr.h b/crypto_kem/ntrulpr857/avx2/crypto_stream_aes256ctr.h new file mode 100644 index 00000000..05ea1718 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_stream_aes256ctr.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_STREAM_AES256CTR_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_STREAM_AES256CTR_H +#include "aes.h" +#include +#include + + + +int PQCLEAN_NTRULPR857_AVX2_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]); + +#endif diff --git a/crypto_kem/ntrulpr857/avx2/crypto_verify_1312.c b/crypto_kem/ntrulpr857/avx2/crypto_verify_1312.c new file mode 100644 index 00000000..950aa37b --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_verify_1312.c @@ -0,0 +1,36 @@ +#include "crypto_verify_1312.h" +#include + +int PQCLEAN_NTRULPR857_AVX2_crypto_verify_1312(const unsigned char *x, const unsigned char *y) { + __m256i diff = _mm256_set1_epi8(0); + unsigned int differentbits = 0; + int i = PQCLEAN_NTRULPR857_AVX2_crypto_verify_1312_BYTES; + + i -= 32; + for (;;) { + do { + __m256i x0 = _mm256_loadu_si256((__m256i *) x); + __m256i y0 = _mm256_loadu_si256((__m256i *) y); + diff |= x0 ^ y0; + i -= 32; + x += 32; 
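
crypto_sort_uint32 above reduces unsigned sorting to the signed sort by flipping the top bit before and after: 0 maps to INT32_MIN and 0xffffffff to INT32_MAX, so signed order on the flipped values equals unsigned order on the originals. A tiny order-preservation check, assuming two's-complement conversion as the rest of this code does (illustration only, not code from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t lo = 1u, hi = 0xffffffffu;            /* lo < hi as unsigned */
    int32_t slo = (int32_t)(lo ^ 0x80000000u);     /* INT32_MIN + 1 */
    int32_t shi = (int32_t)(hi ^ 0x80000000u);     /* INT32_MAX */
    printf("%d\n", slo < shi);                     /* prints: 1, order preserved */
    return 0;
}
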
+ y += 32; + } while (i >= 0); + if (i <= -32) { + break; + } + x += i; + y += i; + } + + diff |= _mm256_srli_epi16(diff, 8); + diff |= _mm256_srli_epi32(diff, 16); + diff |= _mm256_srli_epi64(diff, 32); + + differentbits = _mm256_extract_epi8(diff, 0); + differentbits |= _mm256_extract_epi8(diff, 8); + differentbits |= _mm256_extract_epi8(diff, 16); + differentbits |= _mm256_extract_epi8(diff, 24); + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/crypto_kem/ntrulpr857/avx2/crypto_verify_1312.h b/crypto_kem/ntrulpr857/avx2/crypto_verify_1312.h new file mode 100644 index 00000000..3d1baf27 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/crypto_verify_1312.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_NTRULPR857_AVX2_CRYPTO_VERIFY_1312_H +#define PQCLEAN_NTRULPR857_AVX2_CRYPTO_VERIFY_1312_H + +#include +#define PQCLEAN_NTRULPR857_AVX2_crypto_verify_1312_BYTES 1312 + +int PQCLEAN_NTRULPR857_AVX2_crypto_verify_1312(const unsigned char *x, const unsigned char *y); +#endif diff --git a/crypto_kem/ntrulpr857/avx2/kem.c b/crypto_kem/ntrulpr857/avx2/kem.c new file mode 100644 index 00000000..c13eebf7 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/kem.c @@ -0,0 +1,287 @@ +#include "api.h" +#include "crypto_sort_uint32.h" +#include "crypto_stream_aes256ctr.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* ----- masks */ + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +#define q12 ((q-1)/2) +typedef int16 Fq; + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +/* assumes twos complement; use, e.g., gcc -fwrapv */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +/* works for all uint32 x */ +static Fq Fq_bigfreeze(uint32 x) { + x -= q * ((x * (uint64)q31) >> 31); + x -= q * ((x * (uint64)q31) >> 31); + x -= q; + x += (-(x >> 31)) & (uint32)q; + return x; +} + +/* ----- Top and Right */ + +static int8 Top(Fq C) { + return (tau1 * (int32)(C + tau0) + 16384) >> 15; +} + +static Fq Right(int8 T) { + return Fq_freeze(tau3 * (int32)T - tau2); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* ----- sorting to generate short polynomial */ + +static void Short_fromlist(small *out, const uint32 *in) { + uint32 L[ppadsort]; + int i; + + for (i = 0; i < w; ++i) { + L[i] = in[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (in[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_NTRULPR857_AVX2_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) 
{ + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[p]; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(out, L); +} + +/* ----- Inputs, Generator */ + +typedef int8 Inputs[I]; /* passed by reference */ + +static const unsigned char aes_nonce[16] = {0}; + +/* G = Generator(pk) */ +static void Generator(Fq *G, const unsigned char *pk) { + uint32 L[p]; + int i; + + PQCLEAN_NTRULPR857_AVX2_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, pk); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + G[i] = Fq_bigfreeze(L[i]) - q12; + } +} + +/* ----- NTRU LPRime */ + +#define Seeds_bytes 32 +#define Ciphertexts_bytes (Rounded_bytes+Top_bytes) +#define SecretKeys_bytes Small_bytes +#define PublicKeys_bytes (Seeds_bytes+Rounded_bytes) +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +static void Hide(unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + small b[p]; + int i; + + Inputs_encode(r_enc + 1, r); + { + unsigned char h[Hash_bytes]; + uint32 L[p]; + { + unsigned char s[1 + Inputs_bytes]; + Inputs_encode(s + 1, r); + s[0] = 5; + Hash(h, s, sizeof s); + } + PQCLEAN_NTRULPR857_AVX2_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, h); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(b, L); + } + { + Fq bG[p]; + Generator(bG, pk); + Rq_mult_small(bG, b); + Round_and_encode(c, bG); + c += Rounded_bytes; + } + { + Fq bA[p]; + int8 T[I]; + Rounded_decode(bA, pk + Seeds_bytes); + Rq_mult_small(bA, b); + for (i = 0; i < I; ++i) { + T[i] = Top(Fq_freeze(bA[i] + r[i] * q12)); + } + Top_encode(c, T); + c += Top_bytes; + } + { + unsigned char x[1 + Inputs_bytes + Hash_bytes]; + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] = r_enc[1 + i]; + } + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Inputs_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c, x, sizeof x); + } +} + + +int PQCLEAN_NTRULPR857_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + Fq aG[p]; + int i; + randombytes(pk, Seeds_bytes); + Generator(aG, pk); + { + small a[p]; + Short_random(a); + Rq_mult_small(aG, a); + Small_encode(sk, a); + } + Round_and_encode(pk + Seeds_bytes, aG); + { + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Inputs_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Inputs_bytes); + } + return 0; +} + +int PQCLEAN_NTRULPR857_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + int i; + unsigned char cache[Hash_bytes]; + { + unsigned char y[1 + PublicKeys_bytes]; + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + Inputs r; + { + unsigned char s[Inputs_bytes]; + randombytes(s, sizeof s); + Inputs_decode(r, s); + } + { + unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(c, x, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + return 0; +} + +int PQCLEAN_NTRULPR857_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const 
unsigned char *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Inputs_bytes; + Inputs r; + int i; + { + Fq aB[p]; + Rounded_decode(aB, c); + { + small a[p]; + Small_decode(a, sk); + Rq_mult_small(aB, a); + } + { + int8 T[I]; + Top_decode(T, c + Rounded_bytes); + for (i = 0; i < I; ++i) { + r[i] = -int16_negative_mask(Fq_freeze(Right(T[i]) - aB[i] + 4 * w + 1)); + } + } + } + { + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + int mask; + unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(cnew, x, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] ^= mask & (x[1 + i] ^ rho[i]); + } + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] = 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/ntrulpr857/avx2/params.h b/crypto_kem/ntrulpr857/avx2/params.h new file mode 100644 index 00000000..94309645 --- /dev/null +++ b/crypto_kem/ntrulpr857/avx2/params.h @@ -0,0 +1,61 @@ +#ifndef params_H +#define params_H +#include "crypto_core_multsntrup857.h" +#include "crypto_decode_256x16.h" +#include "crypto_decode_256x2.h" +#include "crypto_decode_857x1723.h" +#include "crypto_decode_857x3.h" +#include "crypto_decode_857xint16.h" +#include "crypto_decode_857xint32.h" +#include "crypto_encode_256x16.h" +#include "crypto_encode_256x2.h" +#include "crypto_encode_857x1723.h" +#include "crypto_encode_857x1723round.h" +#include "crypto_encode_857x3.h" +#include "crypto_encode_857xint16.h" +#include "crypto_verify_1312.h" + + +#define p 857 +#define q 5167 +#define w 281 +#define tau0 2433 +#define tau1 101 +#define tau2 2265 +#define tau3 324 +#define I 256 + +#define ppadsort 857 + +#define q18 51 /* round(2^18/q) */ +#define q27 25976 /* round(2^27/q) */ +#define q31 415615 /* floor(2^31/q) */ + +#define crypto_verify_clen PQCLEAN_NTRULPR857_AVX2_crypto_verify_1312 + +#define Rounded_bytes PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x1723_STRBYTES +#define Rounded_decode PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x1723 + +#define Round_and_encode PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x1723round + +#define Small_bytes PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x3_STRBYTES +#define Small_encode PQCLEAN_NTRULPR857_AVX2_crypto_encode_857x3 +#define Small_decode PQCLEAN_NTRULPR857_AVX2_crypto_decode_857x3 + +#define Top_bytes PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x16_STRBYTES +#define Top_encode PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x16 +#define Top_decode PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x16 + +#define Inputs_bytes PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x2_STRBYTES +#define Inputs_encode PQCLEAN_NTRULPR857_AVX2_crypto_encode_256x2 +#define Inputs_decode PQCLEAN_NTRULPR857_AVX2_crypto_decode_256x2 + +#define crypto_decode_pxint32 PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint32 + +#define crypto_decode_pxint16 PQCLEAN_NTRULPR857_AVX2_crypto_decode_857xint16 + +#define crypto_encode_pxint16 PQCLEAN_NTRULPR857_AVX2_crypto_encode_857xint16 + +#define crypto_core_mult PQCLEAN_NTRULPR857_AVX2_crypto_core_multsntrup857 + +#endif diff --git a/crypto_kem/ntrulpr857/clean/LICENSE b/crypto_kem/ntrulpr857/clean/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/ntrulpr857/clean/Makefile b/crypto_kem/ntrulpr857/clean/Makefile new file mode 100644 
index 00000000..62c358a8 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/Makefile @@ -0,0 +1,19 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libntrulpr857_clean.a +HEADERS=api.h crypto_core_multsntrup857.h crypto_decode_256x16.h crypto_decode_256x2.h crypto_decode_857x1723.h crypto_decode_857x3.h crypto_decode_857xint16.h crypto_decode_857xint32.h crypto_encode_256x16.h crypto_encode_256x2.h crypto_encode_857x1723.h crypto_encode_857x1723round.h crypto_encode_857x3.h crypto_encode_857xint16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_1312.h params.h +OBJECTS=crypto_core_multsntrup857.o crypto_decode_256x16.o crypto_decode_256x2.o crypto_decode_857x1723.o crypto_decode_857x3.o crypto_decode_857xint16.o crypto_decode_857xint32.o crypto_encode_256x16.o crypto_encode_256x2.o crypto_encode_857x1723.o crypto_encode_857x1723round.o crypto_encode_857x3.o crypto_encode_857xint16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1312.o kem.o + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/ntrulpr857/clean/Makefile.Microsoft_nmake b/crypto_kem/ntrulpr857/clean/Makefile.Microsoft_nmake new file mode 100644 index 00000000..ae1b6943 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/Makefile.Microsoft_nmake @@ -0,0 +1,19 @@ +# This Makefile can be used with Microsoft Visual Studio's nmake using the command: +# nmake /f Makefile.Microsoft_nmake + +LIBRARY=libntrulpr857_clean.lib +OBJECTS=crypto_core_multsntrup857.obj crypto_decode_256x16.obj crypto_decode_256x2.obj crypto_decode_857x1723.obj crypto_decode_857x3.obj crypto_decode_857xint16.obj crypto_decode_857xint32.obj crypto_encode_256x16.obj crypto_encode_256x2.obj crypto_encode_857x1723.obj crypto_encode_857x1723round.obj crypto_encode_857x3.obj crypto_encode_857xint16.obj crypto_sort_int32.obj crypto_sort_uint32.obj crypto_stream_aes256ctr.obj crypto_verify_1312.obj kem.obj + +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX + +all: $(LIBRARY) + +# Make sure objects are recompiled if headers change. 
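
For context, a minimal caller of the KEM API declared in the clean api.h below might look as follows. This is a sketch only: it assumes the program is linked against libntrulpr857_clean.a together with PQClean's common randombytes, sha2 and aes implementations, with the include path set as in the Makefile above.

#include <stdio.h>
#include <string.h>
#include "api.h"

int main(void) {
    unsigned char pk[PQCLEAN_NTRULPR857_CLEAN_CRYPTO_PUBLICKEYBYTES];
    unsigned char sk[PQCLEAN_NTRULPR857_CLEAN_CRYPTO_SECRETKEYBYTES];
    unsigned char ct[PQCLEAN_NTRULPR857_CLEAN_CRYPTO_CIPHERTEXTBYTES];
    unsigned char key_enc[PQCLEAN_NTRULPR857_CLEAN_CRYPTO_BYTES];
    unsigned char key_dec[PQCLEAN_NTRULPR857_CLEAN_CRYPTO_BYTES];

    /* keypair, then encapsulate against pk, then decapsulate with sk */
    PQCLEAN_NTRULPR857_CLEAN_crypto_kem_keypair(pk, sk);
    PQCLEAN_NTRULPR857_CLEAN_crypto_kem_enc(ct, key_enc, pk);
    PQCLEAN_NTRULPR857_CLEAN_crypto_kem_dec(key_dec, ct, sk);

    printf("shared secrets %s\n",
           memcmp(key_enc, key_dec, sizeof key_enc) == 0 ? "match" : "differ");
    return 0;
}
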
+$(OBJECTS): *.h + +$(LIBRARY): $(OBJECTS) + LIB.EXE /NOLOGO /WX /OUT:$@ $** + +clean: + -DEL $(OBJECTS) + -DEL $(LIBRARY) diff --git a/crypto_kem/ntrulpr857/clean/api.h b/crypto_kem/ntrulpr857/clean/api.h new file mode 100644 index 00000000..93134750 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/api.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_API_H +#define PQCLEAN_NTRULPR857_CLEAN_API_H + + + +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ALGNAME "ntrulpr857" + +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_SECRETKEYBYTES 1463 +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_PUBLICKEYBYTES 1184 +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_CIPHERTEXTBYTES 1312 +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_BYTES 32 + +int PQCLEAN_NTRULPR857_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int PQCLEAN_NTRULPR857_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); +int PQCLEAN_NTRULPR857_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_core_multsntrup857.c b/crypto_kem/ntrulpr857/clean/crypto_core_multsntrup857.c new file mode 100644 index 00000000..28c153d4 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_core_multsntrup857.c @@ -0,0 +1,60 @@ +#include "crypto_core_multsntrup857.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +int PQCLEAN_NTRULPR857_CLEAN_crypto_core_multsntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + Fq f[p]; + small g[p]; + Fq fg[p + p - 1]; + int32 result; + int i, j; + + crypto_decode_pxint16(f, inbytes); + for (i = 0; i < p; ++i) { + f[i] = Fq_freeze(f[i]); + } + + for (i = 0; i < p; ++i) { + small gi = kbytes[i]; + small gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + for (i = 0; i < p; ++i) { + result = 0; + for (j = 0; j <= i; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + for (i = p; i < p + p - 1; ++i) { + result = 0; + for (j = i - p + 1; j < p; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + + for (i = p + p - 2; i >= p; --i) { + fg[i - p] = Fq_freeze(fg[i - p] + fg[i]); + fg[i - p + 1] = Fq_freeze(fg[i - p + 1] + fg[i]); + } + + crypto_encode_pxint16(outbytes, fg); + return 0; +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_core_multsntrup857.h b/crypto_kem/ntrulpr857/clean/crypto_core_multsntrup857.h new file mode 100644 index 00000000..cde1e9da --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_core_multsntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_CORE_MULTSNTRUP857_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_CORE_MULTSNTRUP857_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_core_multsntrup857_OUTPUTBYTES 1714 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_core_multsntrup857_INPUTBYTES 1714 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_core_multsntrup857_KEYBYTES 857 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_core_multsntrup857_CONSTBYTES 0 + +int PQCLEAN_NTRULPR857_CLEAN_crypto_core_multsntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_256x16.c 
b/crypto_kem/ntrulpr857/clean/crypto_decode_256x16.c new file mode 100644 index 00000000..a43c7272 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_256x16.c @@ -0,0 +1,11 @@ +#include "crypto_decode_256x16.h" + + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x16(void *v, const unsigned char *s) { + unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + T[2 * i] = s[i] & 15; + T[2 * i + 1] = s[i] >> 4; + } +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_256x16.h b/crypto_kem/ntrulpr857/clean/crypto_decode_256x16.h new file mode 100644 index 00000000..afdd8757 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_256X16_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_256X16_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_256x2.c b/crypto_kem/ntrulpr857/clean/crypto_decode_256x2.c new file mode 100644 index 00000000..80e8c85c --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_256x2.c @@ -0,0 +1,10 @@ +#include "crypto_decode_256x2.h" + + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x2(void *v, const unsigned char *s) { + unsigned char *r = v; + int i; + for (i = 0; i < 256; ++i) { + r[i] = 1 & (s[i >> 3] >> (i & 7)); + } +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_256x2.h b/crypto_kem/ntrulpr857/clean/crypto_decode_256x2.h new file mode 100644 index 00000000..acf4e947 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_256X2_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_256X2_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x2(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_857x1723.c b/crypto_kem/ntrulpr857/clean/crypto_decode_857x1723.c new file mode 100644 index 00000000..ca5630a2 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_857x1723.c @@ -0,0 +1,202 @@ +#include "crypto_decode_857x1723.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* +CPU division instruction typically takes time depending on x. +This software is designed to take time independent of x. +Time still varies depending on m; user must ensure that m is constant. +Time also varies on CPUs where multiplication is variable-time. +There could be more CPU issues. +There could also be compiler issues. 
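
The comment block above motivates the branch-free division helper that follows. Below is a self-contained sketch of the same reciprocal-multiply-and-correct idea with a quick check against the compiler's division; divmod14 is an illustrative name, while the real routine is uint32_divmod_uint14 below and carries the full range analysis in its comments.

#include <stdint.h>
#include <stdio.h>

/* Two rounds of "multiply by floor(2^31/m), subtract", then one masked,
   branch-free correction.  Assumes 0 < m < 16384, as the callers do. */
static void divmod14(uint32_t x, uint16_t m, uint32_t *q, uint16_t *r) {
    uint32_t v = 0x80000000u / m;                 /* precomputed floor(2^31/m) */
    uint32_t qpart, mask;

    *q = 0;
    qpart = (uint32_t)(((uint64_t)x * v) >> 31);  /* first approximation of x/m */
    x -= qpart * m;
    *q += qpart;
    qpart = (uint32_t)(((uint64_t)x * v) >> 31);  /* second round: now 0 <= x <= m */
    x -= qpart * m;
    *q += qpart;
    x -= m;                                       /* provisionally step once more */
    *q += 1;
    mask = -(x >> 31);                            /* all-ones iff that went negative */
    x += mask & (uint32_t)m;                      /* branch-free undo */
    *q += mask;
    *r = (uint16_t)x;
}

int main(void) {
    uint32_t q;
    uint16_t r;
    divmod14(4000000000u, 1723, &q, &r);
    printf("%u %u vs %u %u\n", (unsigned)q, (unsigned)r,
           4000000000u / 1723, 4000000000u % 1723);   /* both pairs match */
    return 0;
}
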
+*/ + +static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { + uint32 v = 0x80000000; + uint32 qpart; + uint32 mask; + + v /= m; + + /* caller guarantees m > 0 */ + /* caller guarantees m < 16384 */ + /* vm <= 2^31 <= vm+m-1 */ + /* xvm <= 2^31 x <= xvm+x(m-1) */ + + *q = 0; + + qpart = (x * (uint64)v) >> 31; + /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ + /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ + /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ + /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ + + x -= qpart * m; + *q += qpart; + /* x <= 49146 */ + + qpart = (x * (uint64)v) >> 31; + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ + /* 0 <= newx <= m + 0.4 */ + /* 0 <= newx <= m */ + + x -= qpart * m; + *q += qpart; + /* x <= m */ + + x -= m; + *q += 1; + mask = -(x >> 31); + x += mask & (uint32)m; + *q += mask; + /* x < m */ + + *r = x; +} + +static uint16 uint32_mod_uint14(uint32 x, uint16 m) { + uint32 q; + uint16 r; + uint32_divmod_uint14(&q, &r, x, m); + return r; +} + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x1723(void *v, const unsigned char *s) { + int16 *R0 = v; + uint16 R1[429], R2[215], R3[108], R4[54], R5[27], R6[14], R7[7], R8[4], R9[2], R10[1]; + long long i; + uint16 r0; + uint32 r1, r2; + + s += PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x1723_STRBYTES; + r1 = 0; + r1 = (r1 << 8) | *--s; + r1 = uint32_mod_uint14(r1, 160); /* needed only for invalid inputs */ + R10[0] = r1; + + r2 = R10[0]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 743); + R9[0] = r0; + r1 = uint32_mod_uint14(r1, 14044); /* needed only for invalid inputs */ + R9[1] = r1; + + r2 = R9[1]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 436); + R8[2] = r0; + r1 = uint32_mod_uint14(r1, 8246); /* needed only for invalid inputs */ + R8[3] = r1; + r2 = R9[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 436); + R8[0] = r0; + r1 = uint32_mod_uint14(r1, 436); /* needed only for invalid inputs */ + R8[1] = r1; + + R7[6] = R8[3]; + for (i = 2; i >= 0; --i) { + r2 = R8[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 334); + R7[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 334); /* needed only for invalid inputs */ + R7[2 * i + 1] = r1; + } + + r2 = R7[6]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 292); + R6[12] = r0; + r1 = uint32_mod_uint14(r1, 7229); /* needed only for invalid inputs */ + R6[13] = r1; + for (i = 5; i >= 0; --i) { + r2 = R7[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 292); + R6[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 292); /* needed only for invalid inputs */ + R6[2 * i + 1] = r1; + } + + R5[26] = R6[13]; + for (i = 12; i >= 0; --i) { + r2 = R6[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 273); + R5[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 273); /* needed only for invalid inputs */ + R5[2 * i + 1] = r1; + } + + r2 = R5[26]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 4225); + R4[52] = r0; + r1 = uint32_mod_uint14(r1, 438); /* needed only for invalid inputs */ + R4[53] = r1; + for (i = 25; i >= 0; --i) { + r2 = R5[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 4225); + R4[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 4225); /* needed only for invalid inputs */ + R4[2 * i + 1] = r1; + 
} + + r2 = R4[53]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 65); + R3[106] = r0; + r1 = uint32_mod_uint14(r1, 1723); /* needed only for invalid inputs */ + R3[107] = r1; + for (i = 52; i >= 0; --i) { + r2 = R4[i]; + uint32_divmod_uint14(&r1, &r0, r2, 65); + R3[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 65); /* needed only for invalid inputs */ + R3[2 * i + 1] = r1; + } + + R2[214] = R3[107]; + for (i = 106; i >= 0; --i) { + r2 = R3[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 2053); + R2[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 2053); /* needed only for invalid inputs */ + R2[2 * i + 1] = r1; + } + + R1[428] = R2[214]; + for (i = 213; i >= 0; --i) { + r2 = R2[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 11597); + R1[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 11597); /* needed only for invalid inputs */ + R1[2 * i + 1] = r1; + } + + R0[856] = 3 * R1[428] - 2583; + for (i = 427; i >= 0; --i) { + r2 = R1[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1723); + R0[2 * i] = 3 * r0 - 2583; + r1 = uint32_mod_uint14(r1, 1723); /* needed only for invalid inputs */ + R0[2 * i + 1] = 3 * r1 - 2583; + } +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_857x1723.h b/crypto_kem/ntrulpr857/clean/crypto_decode_857x1723.h new file mode 100644 index 00000000..e9c65aec --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_857x1723.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_857X1723_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_857X1723_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x1723_STRBYTES 1152 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x1723_ITEMS 857 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x1723_ITEMBYTES 2 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x1723(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_857x3.c b/crypto_kem/ntrulpr857/clean/crypto_decode_857x3.c new file mode 100644 index 00000000..9b950df8 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_857x3.c @@ -0,0 +1,24 @@ +#include "crypto_decode_857x3.h" + +#define uint8 uint8_t + +#define p 857 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x3(void *v, const unsigned char *s) { + uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + } + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_857x3.h b/crypto_kem/ntrulpr857/clean/crypto_decode_857x3.h new file mode 100644 index 00000000..b7fd54cf --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_857x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_857X3_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_857X3_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x3_STRBYTES 215 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x3_ITEMS 857 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x3_ITEMBYTES 1 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x3(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_857xint16.c b/crypto_kem/ntrulpr857/clean/crypto_decode_857xint16.c new file mode 100644 index 00000000..ec185e8e --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_857xint16.c @@ -0,0 
+1,16 @@ +#include "crypto_decode_857xint16.h" + + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_857xint16.h b/crypto_kem/ntrulpr857/clean/crypto_decode_857xint16.h new file mode 100644 index 00000000..c774bd94 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_857xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_857XINT16_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_857XINT16_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint16_STRBYTES 1714 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint16_ITEMS 857 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_857xint32.c b/crypto_kem/ntrulpr857/clean/crypto_decode_857xint32.c new file mode 100644 index 00000000..3ad8731e --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_857xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_857xint32.h" + + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_decode_857xint32.h b/crypto_kem/ntrulpr857/clean/crypto_decode_857xint32.h new file mode 100644 index 00000000..8beda2db --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_decode_857xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_857XINT32_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_DECODE_857XINT32_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint32_STRBYTES 3428 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint32_ITEMBYTES 4 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint32_ITEMS 857 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint32(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_encode_256x16.c b/crypto_kem/ntrulpr857/clean/crypto_encode_256x16.c new file mode 100644 index 00000000..dcaba21a --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_256x16.c @@ -0,0 +1,10 @@ +#include "crypto_encode_256x16.h" + + +void PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x16(unsigned char *s, const void *v) { + const unsigned char *T = v; + int i; + for (i = 0; i < 128; ++i) { + s[i] = T[2 * i] + (T[2 * i + 1] << 4); + } +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_encode_256x16.h b/crypto_kem/ntrulpr857/clean/crypto_encode_256x16.h new file mode 100644 index 00000000..30213833 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_256x16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_256X16_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_256X16_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x16_STRBYTES 128 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x16_ITEMS 256 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x16_ITEMBYTES 1 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x16(unsigned char *s, const void *v); +#endif diff --git 
a/crypto_kem/ntrulpr857/clean/crypto_encode_256x2.c b/crypto_kem/ntrulpr857/clean/crypto_encode_256x2.c new file mode 100644 index 00000000..ccd8fb73 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_256x2.c @@ -0,0 +1,13 @@ +#include "crypto_encode_256x2.h" + + +void PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x2(unsigned char *s, const void *v) { + const unsigned char *r = v; + int i; + for (i = 0; i < 32; ++i) { + s[i] = 0; + } + for (i = 0; i < 256; ++i) { + s[i >> 3] |= (r[i] & 1) << (i & 7); + } +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_encode_256x2.h b/crypto_kem/ntrulpr857/clean/crypto_encode_256x2.h new file mode 100644 index 00000000..782cbf1f --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_256x2.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_256X2_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_256X2_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x2_STRBYTES 32 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x2_ITEMS 256 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x2_ITEMBYTES 1 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x2(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723.c b/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723.c new file mode 100644 index 00000000..3922bf8f --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723.c @@ -0,0 +1,130 @@ +#include "crypto_encode_857x1723.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[429]; + long i; + uint16 r0, r1; + uint32 r2; + + for (i = 0; i < 428; ++i) { + r0 = (((R0[2 * i] + 2583) & 16383) * 10923) >> 15; + r1 = (((R0[2 * i + 1] + 2583) & 16383) * 10923) >> 15; + r2 = r0 + r1 * (uint32)1723; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[428] = (((R0[856] + 2583) & 16383) * 10923) >> 15; + + for (i = 0; i < 214; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)11597; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[214] = R[428]; + + for (i = 0; i < 107; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2053; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[107] = R[214]; + + for (i = 0; i < 53; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)65; + R[i] = r2; + } + r0 = R[106]; + r1 = R[107]; + r2 = r0 + r1 * (uint32)65; + *out++ = r2; + r2 >>= 8; + R[53] = r2; + + for (i = 0; i < 26; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)4225; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[52]; + r1 = R[53]; + r2 = r0 + r1 * (uint32)4225; + *out++ = r2; + r2 >>= 8; + R[26] = r2; + + for (i = 0; i < 13; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)273; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[13] = R[26]; + + for (i = 0; i < 7; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)292; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)334; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[3] = R[6]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)436; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + 
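
Each level of the encoder in this file combines two digits r0, r1 of a given radix into r0 + radix*r1, emits low bytes, and carries the remainder upward; the decoder inverts this with the constant-time divmod shown earlier. A one-pair round trip at radix 1723 (illustration only, not code from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint16_t r0 = 1500, r1 = 7;                  /* two digits in [0, 1723) */
    uint32_t r2 = r0 + (uint32_t)1723 * r1;      /* combined value */
    uint8_t byte = (uint8_t)r2;                  /* low byte emitted to the stream */
    uint32_t carry = r2 >> 8;                    /* carried into the next level */

    uint32_t back = (carry << 8) | byte;         /* decode direction */
    printf("%u %u\n", (unsigned)(back % 1723),
           (unsigned)(back / 1723));             /* prints: 1500 7 */
    return 0;
}
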
+ r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)743; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723.h b/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723.h new file mode 100644 index 00000000..ce1754bc --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_857X1723_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_857X1723_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723_STRBYTES 1152 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723_ITEMS 857 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723_ITEMBYTES 2 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723round.c b/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723round.c new file mode 100644 index 00000000..9e7460ba --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723round.c @@ -0,0 +1,17 @@ +#include "crypto_encode_857x1723.h" +#include "crypto_encode_857x1723round.h" + +#define int16 int16_t + +#define p 857 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723round(unsigned char *out, const void *v) { + const int16 *a = v; + int16 x[p]; + int i; + + for (i = 0; i < p; ++i) { + x[i] = 3 * ((10923 * a[i] + 16384) >> 15); + } + PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723(out, x); +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723round.h b/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723round.h new file mode 100644 index 00000000..3dd8f78c --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_857x1723round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_857X1723ROUND_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_857X1723ROUND_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723round_STRBYTES 1152 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723round_ITEMS 857 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723round_ITEMBYTES 2 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_encode_857x3.c b/crypto_kem/ntrulpr857/clean/crypto_encode_857x3.c new file mode 100644 index 00000000..b19644b4 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_857x3.c @@ -0,0 +1,21 @@ +#include "crypto_encode_857x3.h" + +#define uint8 uint8_t + +#define p 857 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x3(unsigned char *s, const void *v) { + const uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *f++ + 1; + x += (*f++ + 1) << 2; + x += (*f++ + 1) << 4; + x += (*f++ + 1) << 6; + *s++ = x; + } + x = *f++ + 1; + *s++ = x; +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_encode_857x3.h b/crypto_kem/ntrulpr857/clean/crypto_encode_857x3.h new file mode 100644 index 00000000..e9fafe77 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_857x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_857X3_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_857X3_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x3_STRBYTES 215 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x3_ITEMS 857 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x3_ITEMBYTES 1 + +void 
PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_encode_857xint16.c b/crypto_kem/ntrulpr857/clean/crypto_encode_857xint16.c new file mode 100644 index 00000000..76a54c19 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_857xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_857xint16.h" + + +void PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_encode_857xint16.h b/crypto_kem/ntrulpr857/clean/crypto_encode_857xint16.h new file mode 100644 index 00000000..a6364ad8 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_encode_857xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_857XINT16_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_ENCODE_857XINT16_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857xint16_STRBYTES 1714 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857xint16_ITEMBYTES 2 +#define PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857xint16_ITEMS 857 + +void PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_sort_int32.c b/crypto_kem/ntrulpr857/clean/crypto_sort_int32.c new file mode 100644 index 00000000..8fb7ae3d --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_sort_int32.c @@ -0,0 +1,86 @@ +#include "crypto_sort_int32.h" +#include +// Based on supercop-20190110/crypto_sort/int32/x86 + + +#define int32 int32_t + +#define int32_MINMAX(a,b) \ + do { \ + int32_t ab = (b) ^ (a); \ + int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \ + c ^= ab & (c ^ (b)); \ + c >>= 31; \ + c &= ab; \ + (a) ^= c; \ + (b) ^= c; \ + } while(0) + +/* assume 2 <= n <= 0x40000000 */ +void PQCLEAN_NTRULPR857_CLEAN_crypto_sort_int32(int32 *array, size_t n) { + size_t top, p, q, r, i, j; + int32 *x = array; + + top = 1; + while (top < n - top) { + top += top; + } + + for (p = top; p >= 1; p >>= 1) { + i = 0; + while (i + 2 * p <= n) { + for (j = i; j < i + p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + i += 2 * p; + } + for (j = i; j < n - p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + + i = 0; + j = 0; + for (q = top; q > p; q >>= 1) { + if (j != i) { + for (;;) { + if (j == n - q) { + goto done; + } + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + if (j == i + p) { + i += 2 * p; + break; + } + } + } + while (i + p <= n - q) { + for (j = i; j < i + p; ++j) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + } + i += 2 * p; + } + /* now i + p > n - q */ + j = i; + while (j < n - q) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + } + +done: + ; + } + } +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_sort_int32.h b/crypto_kem/ntrulpr857/clean/crypto_sort_int32.h new file mode 100644 index 00000000..0c991081 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_SORT_INT32_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_SORT_INT32_H + +#include +#include + + +void PQCLEAN_NTRULPR857_CLEAN_crypto_sort_int32(int32_t *array, size_t n); + +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_sort_uint32.c 
b/crypto_kem/ntrulpr857/clean/crypto_sort_uint32.c new file mode 100644 index 00000000..9da42e12 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_NTRULPR857_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_NTRULPR857_CLEAN_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_sort_uint32.h b/crypto_kem/ntrulpr857/clean/crypto_sort_uint32.h new file mode 100644 index 00000000..8d5b3982 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_sort_uint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_SORT_UINT32_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_SORT_UINT32_H + +#include +#include + + +void PQCLEAN_NTRULPR857_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n); + +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_stream_aes256ctr.c b/crypto_kem/ntrulpr857/clean/crypto_stream_aes256ctr.c new file mode 100644 index 00000000..ee3f9d76 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_stream_aes256ctr.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" + + +int PQCLEAN_NTRULPR857_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); + return 0; +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_stream_aes256ctr.h b/crypto_kem/ntrulpr857/clean/crypto_stream_aes256ctr.h new file mode 100644 index 00000000..409df71a --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_stream_aes256ctr.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_STREAM_AES256CTR_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_STREAM_AES256CTR_H +#include "aes.h" +#include +#include + + + +int PQCLEAN_NTRULPR857_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]); + +#endif diff --git a/crypto_kem/ntrulpr857/clean/crypto_verify_1312.c b/crypto_kem/ntrulpr857/clean/crypto_verify_1312.c new file mode 100644 index 00000000..491e38e3 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_verify_1312.c @@ -0,0 +1,13 @@ +#include "crypto_verify_1312.h" + + +int PQCLEAN_NTRULPR857_CLEAN_crypto_verify_1312(const unsigned char *x, const unsigned char *y) { + unsigned int differentbits = 0; + int i; + + for (i = 0; i < PQCLEAN_NTRULPR857_CLEAN_crypto_verify_1312_BYTES; ++i) { + differentbits |= x[i] ^ y[i]; + } + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/crypto_kem/ntrulpr857/clean/crypto_verify_1312.h b/crypto_kem/ntrulpr857/clean/crypto_verify_1312.h new file mode 100644 index 00000000..b1468aa6 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/crypto_verify_1312.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_NTRULPR857_CLEAN_CRYPTO_VERIFY_1312_H +#define PQCLEAN_NTRULPR857_CLEAN_CRYPTO_VERIFY_1312_H + +#include +#define PQCLEAN_NTRULPR857_CLEAN_crypto_verify_1312_BYTES 1312 + +int PQCLEAN_NTRULPR857_CLEAN_crypto_verify_1312(const unsigned char *x, const unsigned char *y); +#endif diff --git a/crypto_kem/ntrulpr857/clean/kem.c 
b/crypto_kem/ntrulpr857/clean/kem.c new file mode 100644 index 00000000..5fe4d188 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/kem.c @@ -0,0 +1,287 @@ +#include "api.h" +#include "crypto_sort_uint32.h" +#include "crypto_stream_aes256ctr.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* ----- masks */ + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +#define q12 ((q-1)/2) +typedef int16 Fq; + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +/* assumes twos complement; use, e.g., gcc -fwrapv */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +/* works for all uint32 x */ +static Fq Fq_bigfreeze(uint32 x) { + x -= q * ((x * (uint64)q31) >> 31); + x -= q * ((x * (uint64)q31) >> 31); + x -= q; + x += (-(x >> 31)) & (uint32)q; + return x; +} + +/* ----- Top and Right */ + +static int8 Top(Fq C) { + return (tau1 * (int32)(C + tau0) + 16384) >> 15; +} + +static Fq Right(int8 T) { + return Fq_freeze(tau3 * (int32)T - tau2); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* ----- sorting to generate short polynomial */ + +static void Short_fromlist(small *out, const uint32 *in) { + uint32 L[ppadsort]; + int i; + + for (i = 0; i < w; ++i) { + L[i] = in[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (in[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_NTRULPR857_CLEAN_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[p]; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(out, L); +} + +/* ----- Inputs, Generator */ + +typedef int8 Inputs[I]; /* passed by reference */ + +static const unsigned char aes_nonce[16] = {0}; + +/* G = Generator(pk) */ +static void Generator(Fq *G, const unsigned char *pk) { + uint32 L[p]; + int i; + + PQCLEAN_NTRULPR857_CLEAN_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, pk); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + G[i] = Fq_bigfreeze(L[i]) - q12; + } +} + +/* ----- NTRU LPRime */ + +#define Seeds_bytes 32 +#define Ciphertexts_bytes (Rounded_bytes+Top_bytes) +#define SecretKeys_bytes Small_bytes +#define PublicKeys_bytes (Seeds_bytes+Rounded_bytes) +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +static void Hide(unsigned char *c, 
unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + small b[p]; + int i; + + Inputs_encode(r_enc + 1, r); + { + unsigned char h[Hash_bytes]; + uint32 L[p]; + { + unsigned char s[1 + Inputs_bytes]; + Inputs_encode(s + 1, r); + s[0] = 5; + Hash(h, s, sizeof s); + } + PQCLEAN_NTRULPR857_CLEAN_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, h); + crypto_decode_pxint32(L, (unsigned char *) L); + Short_fromlist(b, L); + } + { + Fq bG[p]; + Generator(bG, pk); + Rq_mult_small(bG, b); + Round_and_encode(c, bG); + c += Rounded_bytes; + } + { + Fq bA[p]; + int8 T[I]; + Rounded_decode(bA, pk + Seeds_bytes); + Rq_mult_small(bA, b); + for (i = 0; i < I; ++i) { + T[i] = Top(Fq_freeze(bA[i] + r[i] * q12)); + } + Top_encode(c, T); + c += Top_bytes; + } + { + unsigned char x[1 + Inputs_bytes + Hash_bytes]; + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] = r_enc[1 + i]; + } + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Inputs_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c, x, sizeof x); + } +} + + +int PQCLEAN_NTRULPR857_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + Fq aG[p]; + int i; + randombytes(pk, Seeds_bytes); + Generator(aG, pk); + { + small a[p]; + Short_random(a); + Rq_mult_small(aG, a); + Small_encode(sk, a); + } + Round_and_encode(pk + Seeds_bytes, aG); + { + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Inputs_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Inputs_bytes); + } + return 0; +} + +int PQCLEAN_NTRULPR857_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + int i; + unsigned char cache[Hash_bytes]; + { + unsigned char y[1 + PublicKeys_bytes]; + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + Inputs r; + { + unsigned char s[Inputs_bytes]; + randombytes(s, sizeof s); + Inputs_decode(r, s); + } + { + unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(c, x, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + return 0; +} + +int PQCLEAN_NTRULPR857_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Inputs_bytes; + Inputs r; + int i; + { + Fq aB[p]; + Rounded_decode(aB, c); + { + small a[p]; + Small_decode(a, sk); + Rq_mult_small(aB, a); + } + { + int8 T[I]; + Top_decode(T, c + Rounded_bytes); + for (i = 0; i < I; ++i) { + r[i] = -int16_negative_mask(Fq_freeze(Right(T[i]) - aB[i] + 4 * w + 1)); + } + } + } + { + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + int mask; + unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(cnew, x, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Inputs_bytes; ++i) { + x[1 + i] ^= mask & (x[1 + i] ^ rho[i]); + } + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Inputs_bytes + i] = c[i]; + } + x[0] = 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/ntrulpr857/clean/params.h b/crypto_kem/ntrulpr857/clean/params.h new 
file mode 100644 index 00000000..6bb0c520 --- /dev/null +++ b/crypto_kem/ntrulpr857/clean/params.h @@ -0,0 +1,63 @@ +#ifndef params_H +#define params_H +#include "crypto_core_multsntrup857.h" +#include "crypto_decode_256x16.h" +#include "crypto_decode_256x2.h" +#include "crypto_decode_857x1723.h" +#include "crypto_decode_857x3.h" +#include "crypto_decode_857xint16.h" +#include "crypto_decode_857xint32.h" +#include "crypto_encode_256x16.h" +#include "crypto_encode_256x2.h" +#include "crypto_encode_857x1723.h" +#include "crypto_encode_857x1723round.h" +#include "crypto_encode_857x3.h" +#include "crypto_encode_857xint16.h" +#include "crypto_verify_1312.h" + + +#define p 857 +#define q 5167 +#define w 281 +#define q27 25976 /* closest integer to 2^27/q */ +#define q18 51 /* closest integer to 2^18/q */ +#define tau0 2433 +#define tau1 101 +#define tau2 2265 +#define tau3 324 +#define I 256 + +#define ppadsort 857 + +#define q18 51 /* round(2^18/q) */ +#define q27 25976 /* round(2^27/q) */ +#define q31 415615 /* floor(2^31/q) */ + +#define crypto_verify_clen PQCLEAN_NTRULPR857_CLEAN_crypto_verify_1312 + +#define Rounded_bytes PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x1723_STRBYTES +#define Rounded_decode PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x1723 + +#define Round_and_encode PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x1723round + +#define Small_bytes PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x3_STRBYTES +#define Small_encode PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857x3 +#define Small_decode PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857x3 + +#define Top_bytes PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x16_STRBYTES +#define Top_encode PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x16 +#define Top_decode PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x16 + +#define Inputs_bytes PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x2_STRBYTES +#define Inputs_encode PQCLEAN_NTRULPR857_CLEAN_crypto_encode_256x2 +#define Inputs_decode PQCLEAN_NTRULPR857_CLEAN_crypto_decode_256x2 + +#define crypto_decode_pxint32 PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint32 + +#define crypto_decode_pxint16 PQCLEAN_NTRULPR857_CLEAN_crypto_decode_857xint16 + +#define crypto_encode_pxint16 PQCLEAN_NTRULPR857_CLEAN_crypto_encode_857xint16 + +#define crypto_core_mult PQCLEAN_NTRULPR857_CLEAN_crypto_core_multsntrup857 + +#endif diff --git a/crypto_kem/sntrup653/META.yml b/crypto_kem/sntrup653/META.yml new file mode 100644 index 00000000..1a72f374 --- /dev/null +++ b/crypto_kem/sntrup653/META.yml @@ -0,0 +1,26 @@ +name: sntrup653 +type: kem +claimed-nist-level: 2 +claimed-security: IND-CCA2 +length-public-key: 994 +length-secret-key: 1518 +length-ciphertext: 897 +length-shared-secret: 32 +nistkat-sha256: 91dae8987131825001061f9d194bbfde53b3d17f3962f6992a3ec5fa3cf141d7 +principal-submitters: + - Daniel J. 
Bernstein + - Chitchanok Chuengsatiansup + - Tanja Lange + - Christine van Vredendaal +implementations: + - name: clean + version: supercop-20200826 + - name: avx2 + version: supercop-20200826 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 diff --git a/crypto_kem/sntrup653/avx2/LICENSE b/crypto_kem/sntrup653/avx2/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/sntrup653/avx2/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/sntrup653/avx2/Makefile b/crypto_kem/sntrup653/avx2/Makefile new file mode 100644 index 00000000..bc2fbbd0 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/Makefile @@ -0,0 +1,22 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libsntrup653_avx2.a +HEADERS=api.h crypto_core_inv3sntrup653.h crypto_core_invsntrup653.h crypto_core_mult3sntrup653.h crypto_core_multsntrup653.h crypto_core_multsntrup653_ntt.h crypto_core_scale3sntrup653.h crypto_core_weightsntrup653.h crypto_core_wforcesntrup653.h crypto_decode_653x1541.h crypto_decode_653x3.h crypto_decode_653x4621.h crypto_decode_653xint16.h crypto_decode_653xint32.h crypto_decode_int16.h crypto_encode_653x1541.h crypto_encode_653x1541round.h crypto_encode_653x3.h crypto_encode_653x4621.h crypto_encode_653xfreeze3.h crypto_encode_653xint16.h crypto_encode_int16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_897.h params.h +OBJECTS=crypto_core_inv3sntrup653.o crypto_core_invsntrup653.o crypto_core_mult3sntrup653.o crypto_core_multsntrup653.o crypto_core_multsntrup653_ntt.o crypto_core_scale3sntrup653.o crypto_core_weightsntrup653.o crypto_core_wforcesntrup653.o crypto_decode_653x1541.o crypto_decode_653x3.o crypto_decode_653x4621.o crypto_decode_653xint16.o crypto_decode_653xint32.o crypto_decode_int16.o crypto_encode_653x1541.o crypto_encode_653x1541round.o crypto_encode_653x3.o crypto_encode_653x4621.o crypto_encode_653xfreeze3.o crypto_encode_653xint16.o crypto_encode_int16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_897.o kem.o + +CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.s $(HEADERS) + $(AS) -o $@ $< + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/sntrup653/avx2/api.h b/crypto_kem/sntrup653/avx2/api.h new file mode 100644 index 00000000..c9f95e0a --- /dev/null +++ b/crypto_kem/sntrup653/avx2/api.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_API_H +#define PQCLEAN_SNTRUP653_AVX2_API_H + + + +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_ALGNAME "sntrup653" + +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_SECRETKEYBYTES 1518 +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_PUBLICKEYBYTES 994 +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_CIPHERTEXTBYTES 897 +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_BYTES 32 + +int PQCLEAN_SNTRUP653_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int PQCLEAN_SNTRUP653_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); +int PQCLEAN_SNTRUP653_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_core_inv3sntrup653.c b/crypto_kem/sntrup653/avx2/crypto_core_inv3sntrup653.c new file mode 100644 index 00000000..66292c3a --- /dev/null +++ 
b/crypto_kem/sntrup653/avx2/crypto_core_inv3sntrup653.c @@ -0,0 +1,542 @@ +#include "crypto_core_inv3sntrup653.h" +#include + + +#define int8 int8_t +typedef int8 small; + +#define p 653 +#define ppad 768 +#define numvec 3 + +typedef __m256i vec256; + +/* +This code stores 768-coeff poly as vec256[3]. +Order of 256 coefficients in each vec256 +is optimized in light of costs of vector instructions: + 0,4,...,252 in 64-bit word; + 1,5,...,253 in 64-bit word; + 2,6,...,254 in 64-bit word; + 3,7,...,255 in 64-bit word. +*/ + +static inline void vec256_frombits(vec256 *v, const small *b) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 b0 = _mm256_loadu_si256((vec256 *) b); + b += 32; /* 0,1,...,31 */ + vec256 b1 = _mm256_loadu_si256((vec256 *) b); + b += 32; /* 32,33,... */ + vec256 b2 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b3 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b4 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b5 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b6 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b7 = _mm256_loadu_si256((vec256 *) b); + b += 32; + + vec256 c0 = _mm256_unpacklo_epi32(b0, b1); /* 0 1 2 3 32 33 34 35 4 5 6 7 36 37 38 39 ... 55 */ + vec256 c1 = _mm256_unpackhi_epi32(b0, b1); /* 8 9 10 11 40 41 42 43 ... 63 */ + vec256 c2 = _mm256_unpacklo_epi32(b2, b3); + vec256 c3 = _mm256_unpackhi_epi32(b2, b3); + vec256 c4 = _mm256_unpacklo_epi32(b4, b5); + vec256 c5 = _mm256_unpackhi_epi32(b4, b5); + vec256 c6 = _mm256_unpacklo_epi32(b6, b7); + vec256 c7 = _mm256_unpackhi_epi32(b6, b7); + + vec256 d0 = c0 | _mm256_slli_epi32(c1, 2); /* 0 8, 1 9, 2 10, 3 11, 32 40, 33 41, ..., 55 63 */ + vec256 d2 = c2 | _mm256_slli_epi32(c3, 2); + vec256 d4 = c4 | _mm256_slli_epi32(c5, 2); + vec256 d6 = c6 | _mm256_slli_epi32(c7, 2); + + vec256 e0 = _mm256_unpacklo_epi64(d0, d2); + vec256 e2 = _mm256_unpackhi_epi64(d0, d2); + vec256 e4 = _mm256_unpacklo_epi64(d4, d6); + vec256 e6 = _mm256_unpackhi_epi64(d4, d6); + + vec256 f0 = e0 | _mm256_slli_epi32(e2, 1); + vec256 f4 = e4 | _mm256_slli_epi32(e6, 1); + + vec256 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + vec256 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + + vec256 h = g0 | _mm256_slli_epi32(g4, 4); + +#define TRANSPOSE _mm256_set_epi8( 31,27,23,19, 30,26,22,18, 29,25,21,17, 28,24,20,16, 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0 ) + h = _mm256_shuffle_epi8(h, TRANSPOSE); + h = _mm256_permute4x64_epi64(h, 0xd8); + h = _mm256_shuffle_epi32(h, 0xd8); + + *v++ = h; + } +} + +static inline void vec256_tobits(const vec256 *v, small *b) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 h = *v++; + + h = _mm256_shuffle_epi32(h, 0xd8); + h = _mm256_permute4x64_epi64(h, 0xd8); + h = _mm256_shuffle_epi8(h, TRANSPOSE); + + vec256 g0 = h & _mm256_set1_epi8(15); + vec256 g4 = _mm256_srli_epi32(h, 4) & _mm256_set1_epi8(15); + + vec256 f0 = _mm256_permute2x128_si256(g0, g4, 0x20); + vec256 f4 = _mm256_permute2x128_si256(g0, g4, 0x31); + + vec256 e0 = f0 & _mm256_set1_epi8(5); + vec256 e2 = _mm256_srli_epi32(f0, 1) & _mm256_set1_epi8(5); + vec256 e4 = f4 & _mm256_set1_epi8(5); + vec256 e6 = _mm256_srli_epi32(f4, 1) & _mm256_set1_epi8(5); + + vec256 d0 = _mm256_unpacklo_epi32(e0, e2); + vec256 d2 = _mm256_unpackhi_epi32(e0, e2); + vec256 d4 = _mm256_unpacklo_epi32(e4, e6); + vec256 d6 = _mm256_unpackhi_epi32(e4, e6); + + vec256 c0 = d0 & _mm256_set1_epi8(1); + vec256 c1 = _mm256_srli_epi32(d0, 2) & _mm256_set1_epi8(1); + vec256 c2 = d2 & _mm256_set1_epi8(1); + vec256 c3 = 
_mm256_srli_epi32(d2, 2) & _mm256_set1_epi8(1); + vec256 c4 = d4 & _mm256_set1_epi8(1); + vec256 c5 = _mm256_srli_epi32(d4, 2) & _mm256_set1_epi8(1); + vec256 c6 = d6 & _mm256_set1_epi8(1); + vec256 c7 = _mm256_srli_epi32(d6, 2) & _mm256_set1_epi8(1); + + vec256 b0 = _mm256_unpacklo_epi64(c0, c1); + vec256 b1 = _mm256_unpackhi_epi64(c0, c1); + vec256 b2 = _mm256_unpacklo_epi64(c2, c3); + vec256 b3 = _mm256_unpackhi_epi64(c2, c3); + vec256 b4 = _mm256_unpacklo_epi64(c4, c5); + vec256 b5 = _mm256_unpackhi_epi64(c4, c5); + vec256 b6 = _mm256_unpacklo_epi64(c6, c7); + vec256 b7 = _mm256_unpackhi_epi64(c6, c7); + + _mm256_storeu_si256((vec256 *) b, b0); + b += 32; + _mm256_storeu_si256((vec256 *) b, b1); + b += 32; + _mm256_storeu_si256((vec256 *) b, b2); + b += 32; + _mm256_storeu_si256((vec256 *) b, b3); + b += 32; + _mm256_storeu_si256((vec256 *) b, b4); + b += 32; + _mm256_storeu_si256((vec256 *) b, b5); + b += 32; + _mm256_storeu_si256((vec256 *) b, b6); + b += 32; + _mm256_storeu_si256((vec256 *) b, b7); + b += 32; + } +} + +static void vec256_init(vec256 *G0, vec256 *G1, const small *s) { + int i; + small srev[ppad + (ppad - p)]; + small si; + small g0[ppad]; + small g1[ppad]; + + for (i = 0; i < p; ++i) { + srev[ppad - 1 - i] = s[i]; + } + for (i = 0; i < ppad - p; ++i) { + srev[i] = 0; + } + for (i = p; i < ppad; ++i) { + srev[i + ppad - p] = 0; + } + + for (i = 0; i < ppad; ++i) { + si = srev[i + ppad - p]; + g0[i] = si & 1; + g1[i] = (si >> 1) & g0[i]; + } + + vec256_frombits(G0, g0); + vec256_frombits(G1, g1); +} + +static void vec256_final(small *out, const vec256 *V0, const vec256 *V1) { + int i; + small v0[ppad]; + small v1[ppad]; + small v[ppad]; + small vrev[ppad + (ppad - p)]; + + vec256_tobits(V0, v0); + vec256_tobits(V1, v1); + + for (i = 0; i < ppad; ++i) { + v[i] = v0[i] + 2 * v1[i] - 4 * (v0[i] & v1[i]); + } + + for (i = 0; i < ppad; ++i) { + vrev[i] = v[ppad - 1 - i]; + } + for (i = ppad; i < ppad + (ppad - p); ++i) { + vrev[i] = 0; + } + + for (i = 0; i < p; ++i) { + out[i] = vrev[i + ppad - p]; + } +} + +static inline int negative_mask(int x) { + return x >> 31; +} + +static inline void vec256_swap(vec256 *f, vec256 *g, int len, vec256 mask) { + vec256 flip; + int i; + + for (i = 0; i < len; ++i) { + flip = mask & (f[i] ^ g[i]); + f[i] ^= flip; + g[i] ^= flip; + } +} + +static inline void vec256_scale(vec256 *f0, vec256 *f1, const vec256 c0, const vec256 c1) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 f0i = f0[i]; + vec256 f1i = f1[i]; + + f0i &= c0; + f1i ^= c1; + f1i &= f0i; + + f0[i] = f0i; + f1[i] = f1i; + } +} + +static inline void vec256_eliminate(vec256 *f0, vec256 *f1, vec256 *g0, vec256 *g1, int len, const vec256 c0, const vec256 c1) { + int i; + + for (i = 0; i < len; ++i) { + vec256 f0i = f0[i]; + vec256 f1i = f1[i]; + vec256 g0i = g0[i]; + vec256 g1i = g1[i]; + vec256 t; + + f0i &= c0; + f1i ^= c1; + f1i &= f0i; + + t = g0i ^ f0i; + g0[i] = t | (g1i ^ f1i); + g1[i] = (g1i ^ f0i) & (f1i ^ t); + } +} + +static inline int vec256_bit0mask(vec256 *f) { + return -(_mm_cvtsi128_si32(_mm256_castsi256_si128(f[0])) & 1); +} + +static inline void vec256_divx_1(vec256 *f) { + vec256 f0 = f[0]; + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + + low0 = low0 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); +} + +static inline void vec256_divx_2(vec256 *f) { + vec256 f0 = f[0]; + vec256 f1 = f[1]; + + unsigned long long low0 = 
_mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + + low0 = (low0 >> 1) | (low1 << 63); + low1 = low1 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); + f[1] = _mm256_permute4x64_epi64(f1, 0x39); +} + +static inline void vec256_divx_3(vec256 *f) { + vec256 f0 = f[0]; + vec256 f1 = f[1]; + vec256 f2 = f[2]; + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2)); + + low0 = (low0 >> 1) | (low1 << 63); + low1 = (low1 >> 1) | (low2 << 63); + low2 = low2 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); + f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); + f[1] = _mm256_permute4x64_epi64(f1, 0x39); + f[2] = _mm256_permute4x64_epi64(f2, 0x39); +} + +static inline void vec256_timesx_1(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + + low0 = low0 << 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + + f[0] = f0; +} + +static inline void vec256_timesx_2(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + + low1 = (low1 << 1) | (low0 >> 63); + low0 = low0 << 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); + + f[0] = f0; + f[1] = f1; +} + +static inline void vec256_timesx_3(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); + vec256 f2 = _mm256_permute4x64_epi64(f[2], 0x93); + + unsigned long long low0 = *(unsigned long long *) &f0; + unsigned long long low1 = *(unsigned long long *) &f1; + unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2)); + + low2 = (low2 << 1) | (low1 >> 63); + low1 = (low1 << 1) | (low0 >> 63); + low0 = low0 << 1; + + *(unsigned long long *) &f0 = low0; + *(unsigned long long *) &f1 = low1; + f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3); + + f[0] = f0; + f[1] = f1; + f[2] = f2; +} + + +int PQCLEAN_SNTRUP653_AVX2_crypto_core_inv3sntrup653(unsigned char *outbytes, const unsigned char *inbytes) { + small *out = (void *) outbytes; + small *in = (void *) inbytes; + vec256 F0[numvec]; + vec256 F1[numvec]; + vec256 G0[numvec]; + vec256 G1[numvec]; + vec256 V0[numvec]; + vec256 V1[numvec]; + vec256 R0[numvec]; + vec256 R1[numvec]; + vec256 c0vec, c1vec; + int loop; + int c0, c1; + int minusdelta = -1; + int swapmask; + vec256 swapvec; + + vec256_init(G0, G1, in); + F0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1); + F0[1] = _mm256_set1_epi32(0); + F0[2] = _mm256_set_epi32(0, 0, 0, 0, 8, 0, 8, 0); + F1[0] = _mm256_set1_epi32(0); + F1[1] = _mm256_set1_epi32(0); + F1[2] = _mm256_set_epi32(0, 0, 0, 0, 8, 0, 8, 0); + + V0[0] = _mm256_set1_epi32(0); + V1[0] = _mm256_set1_epi32(0); + V0[1] = _mm256_set1_epi32(0); + 
V1[1] = _mm256_set1_epi32(0); + V0[2] = _mm256_set1_epi32(0); + V1[2] = _mm256_set1_epi32(0); + + R0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1); + R1[0] = _mm256_set1_epi32(0); + R0[1] = _mm256_set1_epi32(0); + R1[1] = _mm256_set1_epi32(0); + R0[2] = _mm256_set1_epi32(0); + R1[2] = _mm256_set1_epi32(0); + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_1(V0); + vec256_timesx_1(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 3, swapvec); + vec256_swap(F1, G1, 3, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); + vec256_divx_3(G0); + vec256_divx_3(G1); + + vec256_swap(V0, R0, 1, swapvec); + vec256_swap(V1, R1, 1, swapvec); + vec256_eliminate(V0, V1, R0, R1, 1, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_2(V0); + vec256_timesx_2(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 3, swapvec); + vec256_swap(F1, G1, 3, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); + vec256_divx_3(G0); + vec256_divx_3(G1); + + vec256_swap(V0, R0, 2, swapvec); + vec256_swap(V1, R1, 2, swapvec); + vec256_eliminate(V0, V1, R0, R1, 2, c0vec, c1vec); + } + + for (loop = 281; loop > 0; --loop) { + vec256_timesx_3(V0); + vec256_timesx_3(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 3, swapvec); + vec256_swap(F1, G1, 3, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); + vec256_divx_3(G0); + vec256_divx_3(G1); + + vec256_swap(V0, R0, 3, swapvec); + vec256_swap(V1, R1, 3, swapvec); + vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_3(V0); + vec256_timesx_3(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 2, swapvec); + vec256_swap(F1, G1, 2, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 2, c0vec, c1vec); + vec256_divx_2(G0); + vec256_divx_2(G1); + + vec256_swap(V0, R0, 3, swapvec); + vec256_swap(V1, R1, 3, swapvec); + vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_3(V0); + vec256_timesx_3(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; 
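The swapmask, c0 and c1 values computed just above are lane masks that are either all-zero or all-one, so every swap and elimination step in these divstep loops either takes full effect or is a no-op, with no secret-dependent branches. A minimal scalar sketch of the same conditional-swap idiom (an illustration under assumed names and types, not code from this patch):

#include <stdint.h>

/* mask must be 0 or 0xffffffff; swaps *a and *b exactly when mask is all-ones */
static void cswap32(uint32_t *a, uint32_t *b, uint32_t mask) {
    uint32_t t = mask & (*a ^ *b);
    *a ^= t;
    *b ^= t;
}

vec256_swap applies this XOR-with-masked-difference pattern to whole __m256i words, and vec256_eliminate uses the c0/c1 masks in the same way to select the multiplier for the bit-sliced mod-3 update.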
+ + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 1, swapvec); + vec256_swap(F1, G1, 1, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 1, c0vec, c1vec); + vec256_divx_1(G0); + vec256_divx_1(G1); + + vec256_swap(V0, R0, 3, swapvec); + vec256_swap(V1, R1, 3, swapvec); + vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); + } + + c0vec = _mm256_set1_epi32(vec256_bit0mask(F0)); + c1vec = _mm256_set1_epi32(vec256_bit0mask(F1)); + vec256_scale(V0, V1, c0vec, c1vec); + + vec256_final(out, V0, V1); + out[p] = negative_mask(minusdelta); + return 0; +} diff --git a/crypto_kem/sntrup653/avx2/crypto_core_inv3sntrup653.h b/crypto_kem/sntrup653/avx2/crypto_core_inv3sntrup653.h new file mode 100644 index 00000000..3b1ca939 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_inv3sntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_INV3SNTRUP653_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_INV3SNTRUP653_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_inv3sntrup653_OUTPUTBYTES 654 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_inv3sntrup653_INPUTBYTES 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_inv3sntrup653_KEYBYTES 0 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_inv3sntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_AVX2_crypto_core_inv3sntrup653(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_core_invsntrup653.c b/crypto_kem/sntrup653/avx2/crypto_core_invsntrup653.c new file mode 100644 index 00000000..e36d64c5 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_invsntrup653.c @@ -0,0 +1,202 @@ +#include "crypto_core_invsntrup653.h" +#include "params.h" +#include + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + + + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + return x >> 15; /* XXX: theoretically need gcc -fwrapv for this */ +} + +/* ----- arithmetic mod q */ + +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +/* nonnegative e */ +static Fq Fq_pow(Fq a, int e) { + if (e == 0) { + return 1; + } + if (e == 1) { + return a; + } + if (e & 1) { + return Fq_freeze(a * (int32)Fq_pow(a, e - 1)); + } + a = Fq_freeze(a * (int32)a); + return Fq_pow(a, e >> 1); +} + +static Fq Fq_recip(Fq a) { + return Fq_pow(a, q - 2); +} + +/* ----- more */ + +#define qvec _mm256_set1_epi16(q) +#define qinvvec _mm256_set1_epi16(qinv) + +static inline __m256i montproduct(__m256i x, __m256i y, __m256i yqinv) { + __m256i hi, d, e; + + d = _mm256_mullo_epi16(x, yqinv); + hi = _mm256_mulhi_epi16(x, y); + e = _mm256_mulhi_epi16(d, qvec); + return _mm256_sub_epi16(hi, e); +} + +static inline void vectormodq_swapeliminate(Fq *f, Fq *g, int len, const Fq f0, const Fq g0, int mask) { + __m256i f0vec = _mm256_set1_epi16(f0); + __m256i g0vec = 
_mm256_set1_epi16(g0); + __m256i f0vecqinv = _mm256_mullo_epi16(f0vec, qinvvec); + __m256i g0vecqinv = _mm256_mullo_epi16(g0vec, qinvvec); + __m256i maskvec = _mm256_set1_epi32(mask); + + while (len > 0) { + __m256i fi = _mm256_loadu_si256((__m256i *) f); + __m256i gi = _mm256_loadu_si256((__m256i *) g); + __m256i finew = _mm256_blendv_epi8(fi, gi, maskvec); + __m256i ginew = _mm256_blendv_epi8(gi, fi, maskvec); + ginew = _mm256_sub_epi16(montproduct(ginew, f0vec, f0vecqinv), montproduct(finew, g0vec, g0vecqinv)); + _mm256_storeu_si256((__m256i *) f, finew); + _mm256_storeu_si256((__m256i *) (g - 1), ginew); + f += 16; + g += 16; + len -= 16; + } +} + +static inline void vectormodq_xswapeliminate(Fq *f, Fq *g, int len, const Fq f0, const Fq g0, int mask) { + __m256i f0vec = _mm256_set1_epi16(f0); + __m256i g0vec = _mm256_set1_epi16(g0); + __m256i f0vecqinv = _mm256_mullo_epi16(f0vec, qinvvec); + __m256i g0vecqinv = _mm256_mullo_epi16(g0vec, qinvvec); + __m256i maskvec = _mm256_set1_epi32(mask); + + f += len + (-len & 15); + g += len + (-len & 15); + while (len > 0) { + f -= 16; + g -= 16; + len -= 16; + __m256i fi = _mm256_loadu_si256((__m256i *) f); + __m256i gi = _mm256_loadu_si256((__m256i *) g); + __m256i finew = _mm256_blendv_epi8(fi, gi, maskvec); + __m256i ginew = _mm256_blendv_epi8(gi, fi, maskvec); + ginew = _mm256_sub_epi16(montproduct(ginew, f0vec, f0vecqinv), montproduct(finew, g0vec, g0vecqinv)); + _mm256_storeu_si256((__m256i *) (f + 1), finew); + _mm256_storeu_si256((__m256i *) g, ginew); + } +} + +int PQCLEAN_SNTRUP653_AVX2_crypto_core_invsntrup653(unsigned char *outbytes, const unsigned char *inbytes) { + small *in = (void *) inbytes; + int loop; + Fq out[p], f[ppad], g[ppad], v[ppad], r[ppad]; + Fq f0, g0; + Fq scale; + int i; + int delta = 1; + int minusdelta; + int fgflip; + int swap; + + for (i = 0; i < ppad; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = -1; + f[p] = -1; + /* generalization: initialize f to reversal of any deg-p polynomial m */ + + for (i = 0; i < p; ++i) { + g[i] = in[p - 1 - i]; + } + for (i = p; i < ppad; ++i) { + g[i] = 0; + } + + for (i = 0; i < ppad; ++i) { + r[i] = 0; + } + r[0] = Fq_recip(3); + + for (i = 0; i < ppad; ++i) { + v[i] = 0; + } + + for (loop = 0; loop < p; ++loop) { + g0 = Fq_freeze(g[0]); + f0 = f[0]; + + minusdelta = -delta; + swap = int16_negative_mask(minusdelta) & int16_nonzero_mask(g0); + delta ^= swap & (delta ^ minusdelta); + delta += 1; + + fgflip = swap & (f0 ^ g0); + f0 ^= fgflip; + g0 ^= fgflip; + + f[0] = f0; + + vectormodq_swapeliminate(f + 1, g + 1, p, f0, g0, swap); + vectormodq_xswapeliminate(v, r, loop + 1, f0, g0, swap); + } + + for (loop = p - 1; loop > 0; --loop) { + g0 = Fq_freeze(g[0]); + f0 = f[0]; + + minusdelta = -delta; + swap = int16_negative_mask(minusdelta) & int16_nonzero_mask(g0); + delta ^= swap & (delta ^ minusdelta); + delta += 1; + + fgflip = swap & (f0 ^ g0); + f0 ^= fgflip; + g0 ^= fgflip; + + f[0] = f0; + + vectormodq_swapeliminate(f + 1, g + 1, loop, f0, g0, swap); + vectormodq_xswapeliminate(v, r, p, f0, g0, swap); + } + + scale = Fq_recip(Fq_freeze(f[0])); + for (i = 0; i < p; ++i) { + out[i] = Fq_freeze(scale * (int32)Fq_freeze(v[p - i])); + } + + crypto_encode_pxint16(outbytes, out); + outbytes[2 * p] = int16_nonzero_mask(delta); + return 0; +} diff --git a/crypto_kem/sntrup653/avx2/crypto_core_invsntrup653.h b/crypto_kem/sntrup653/avx2/crypto_core_invsntrup653.h new file mode 100644 index 00000000..b0d14477 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_invsntrup653.h @@ 
-0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_INVSNTRUP653_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_INVSNTRUP653_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_invsntrup653_OUTPUTBYTES 1307 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_invsntrup653_INPUTBYTES 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_invsntrup653_KEYBYTES 0 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_invsntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_AVX2_crypto_core_invsntrup653(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_core_mult3sntrup653.c b/crypto_kem/sntrup653/avx2/crypto_core_mult3sntrup653.c new file mode 100644 index 00000000..4a692c84 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_mult3sntrup653.c @@ -0,0 +1,259 @@ +#include "crypto_core_mult3sntrup653.h" +#include "crypto_core_multsntrup653_ntt.h" +#include "crypto_decode_653xint16.h" +#include "crypto_encode_653xint16.h" +#include + +typedef int8_t int8; +typedef int16_t int16; + +#define int16x16 __m256i +#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p)) +#define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) +#define const_x16 _mm256_set1_epi16 +#define add_x16 _mm256_add_epi16 +#define sub_x16 _mm256_sub_epi16 +#define mullo_x16 _mm256_mullo_epi16 +#define mulhi_x16 _mm256_mulhi_epi16 +#define mulhrs_x16 _mm256_mulhrs_epi16 +#define signmask_x16(x) _mm256_srai_epi16((x),15) + +typedef union { + int16 v[3][512]; + int16x16 _dummy; +} vec3x512; + +typedef union { + int16 v[768]; + int16x16 _dummy; +} vec768; + +typedef union { + int16 v[3 * 512]; + int16x16 _dummy; +} vec1536; + +static int16x16 squeeze_3_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(10923)), const_x16(3))); +} + +static int16x16 squeeze_7681_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); +} + +static int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(7681)); + return sub_x16(b, e); +} + +#define mask0 _mm256_set_epi16(-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1) +#define mask1 _mm256_set_epi16(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0) +#define mask2 _mm256_set_epi16(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0) + +static void good(int16 fpad[3][512], const int16 f[768]) { + int j; + int16x16 f0, f1; + + j = 0; + for (;;) { + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[1][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[2][j], (f0 & mask2) | (f1 & mask0)); + j += 16; + if (j == 256) { + break; + } + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[1][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[2][j], (f0 & mask1) | (f1 & mask2)); + j += 16; + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[1][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[2][j], (f0 & mask0) | (f1 & mask1)); + j += 16; + } + for (;;) { + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask2); + store_x16(&fpad[1][j], f0 & mask0); + store_x16(&fpad[2][j], f0 & mask1); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask1); + store_x16(&fpad[1][j], f0 & mask2); + 
store_x16(&fpad[2][j], f0 & mask0); + j += 16; + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask0); + store_x16(&fpad[1][j], f0 & mask1); + store_x16(&fpad[2][j], f0 & mask2); + j += 16; + } +} + +static void ungood(int16 f[1536], const int16 fpad[3][512]) { + int j; + int16x16 f0, f1, f2, g0, g1, g2; + + j = 0; + + for (;;) { + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g1 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask2)|(f1&mask0)|(f2&mask1) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g1 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask1)|(f1&mask2)|(f2&mask0) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g1 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask0)|(f1&mask1)|(f2&mask2) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + } +} + +static void mult768(int16 h[1536], const int16 f[768], const int16 g[768]) { + vec3x512 x1, x2; + vec1536 x3; +#define fpad (x1.v) +#define gpad (x2.v) +#define hpad fpad +#define h_7681 (x3.v) + int i; + + good(fpad, f); + PQCLEAN_SNTRUP653_AVX2_ntt512_7681(fpad[0], 3); + + good(gpad, g); + PQCLEAN_SNTRUP653_AVX2_ntt512_7681(gpad[0], 3); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_7681_x16(f0, g0); + int16x16 d1 = mulmod_7681_x16(f1, g1); + int16x16 d2 = mulmod_7681_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_7681_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_7681_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_7681_x16(h0)); + store_x16(&hpad[1][i], squeeze_7681_x16(h1)); + store_x16(&hpad[2][i], squeeze_7681_x16(h2)); + } + + PQCLEAN_SNTRUP653_AVX2_invntt512_7681(hpad[0], 3); + ungood(h_7681, (const int16(*)[512]) hpad); + + for (i = 0; i < 1536; i += 16) { + int16x16 u = load_x16(&h_7681[i]); + u = mulmod_7681_x16(u, const_x16(956)); + store_x16(&h[i], u); + } +} + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xint16 + +#define p 653 + +static inline int16x16 freeze_3_x16(int16x16 x) { + int16x16 mask, x3; + x = add_x16(x, const_x16(3)&signmask_x16(x)); + mask = signmask_x16(sub_x16(x, const_x16(2))); + x3 = sub_x16(x, const_x16(3)); + x = _mm256_blendv_epi8(x3, x, mask); + return x; +} + +int PQCLEAN_SNTRUP653_AVX2_crypto_core_mult3sntrup653(unsigned char *outbytes, const 
unsigned char *inbytes, const unsigned char *kbytes) { + vec768 x1, x2; + vec1536 x3; +#define f (x1.v) +#define g (x2.v) +#define fg (x3.v) +#define h f + int i; + int16x16 x; + + x = const_x16(0); + for (i = p & ~15; i < 768; i += 16) { + store_x16(&f[i], x); + } + for (i = p & ~15; i < 768; i += 16) { + store_x16(&g[i], x); + } + + for (i = 0; i < p; ++i) { + int8 fi = inbytes[i]; + int8 fi0 = fi & 1; + f[i] = fi0 - (fi & (fi0 << 1)); + } + for (i = 0; i < p; ++i) { + int8 gi = kbytes[i]; + int8 gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + mult768(fg, f, g); + + fg[0] -= fg[p - 1]; + for (i = 0; i < 768; i += 16) { + int16x16 fgi = load_x16(&fg[i]); + int16x16 fgip = load_x16(&fg[i + p]); + int16x16 fgip1 = load_x16(&fg[i + p - 1]); + x = add_x16(fgi, add_x16(fgip, fgip1)); + x = freeze_3_x16(squeeze_3_x16(x)); + store_x16(&h[i], x); + } + + for (i = 0; i < p; ++i) { + outbytes[i] = h[i]; + } + + return 0; +} diff --git a/crypto_kem/sntrup653/avx2/crypto_core_mult3sntrup653.h b/crypto_kem/sntrup653/avx2/crypto_core_mult3sntrup653.h new file mode 100644 index 00000000..e4e84330 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_mult3sntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_MULT3SNTRUP653_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_MULT3SNTRUP653_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_mult3sntrup653_OUTPUTBYTES 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_mult3sntrup653_INPUTBYTES 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_mult3sntrup653_KEYBYTES 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_mult3sntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_AVX2_crypto_core_mult3sntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653.c b/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653.c new file mode 100644 index 00000000..d91f5582 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653.c @@ -0,0 +1,314 @@ +#include "crypto_core_multsntrup653.h" +#include "crypto_core_multsntrup653_ntt.h" +#include "crypto_decode_653xint16.h" +#include "crypto_encode_653xint16.h" +#include + +typedef int8_t int8; +typedef int16_t int16; + +#define int16x16 __m256i +#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p)) +#define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) +#define const_x16 _mm256_set1_epi16 +#define add_x16 _mm256_add_epi16 +#define sub_x16 _mm256_sub_epi16 +#define mullo_x16 _mm256_mullo_epi16 +#define mulhi_x16 _mm256_mulhi_epi16 +#define mulhrs_x16 _mm256_mulhrs_epi16 +#define signmask_x16(x) _mm256_srai_epi16((x),15) + +typedef union { + int16 v[3][512]; + int16x16 _dummy; +} vec3x512; + +typedef union { + int16 v[768]; + int16x16 _dummy; +} vec768; + +typedef union { + int16 v[3 * 512]; + int16x16 _dummy; +} vec1536; + +static inline int16x16 squeeze_4621_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(7)), const_x16(4621))); +} + +static inline int16x16 squeeze_7681_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); +} + +static inline int16x16 squeeze_10753_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(3)), const_x16(10753))); +} + +static inline int16x16 mulmod_4621_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-29499)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, 
const_x16(4621)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(7681)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_10753_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-10751)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(10753)); + return sub_x16(b, e); +} + +#define mask0 _mm256_set_epi16(-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1) +#define mask1 _mm256_set_epi16(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0) +#define mask2 _mm256_set_epi16(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0) + +static void good(int16 fpad[3][512], const int16 f[768]) { + int j; + int16x16 f0, f1; + + j = 0; + for (;;) { + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[1][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[2][j], (f0 & mask2) | (f1 & mask0)); + j += 16; + if (j == 256) { + break; + } + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[1][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[2][j], (f0 & mask1) | (f1 & mask2)); + j += 16; + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[1][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[2][j], (f0 & mask0) | (f1 & mask1)); + j += 16; + } + for (;;) { + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask2); + store_x16(&fpad[1][j], f0 & mask0); + store_x16(&fpad[2][j], f0 & mask1); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask1); + store_x16(&fpad[1][j], f0 & mask2); + store_x16(&fpad[2][j], f0 & mask0); + j += 16; + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask0); + store_x16(&fpad[1][j], f0 & mask1); + store_x16(&fpad[2][j], f0 & mask2); + j += 16; + } +} + +static void ungood(int16 f[1536], const int16 fpad[3][512]) { + int j; + int16x16 f0, f1, f2, g0, g1, g2; + + j = 0; + + for (;;) { + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g1 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask2)|(f1&mask0)|(f2&mask1) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g1 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask1)|(f1&mask2)|(f2&mask0) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g1 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask0)|(f1&mask1)|(f2&mask2) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + } +} + +static void mult768(int16 h[1536], const int16 f[768], const int16 g[768]) { + vec3x512 x1, 
x2; + vec1536 x3, x4; +#define fpad (x1.v) +#define gpad (x2.v) +#define hpad fpad +#define h_7681 (x3.v) +#define h_10753 (x4.v) + int i; + + good(fpad, f); + PQCLEAN_SNTRUP653_AVX2_ntt512_7681(fpad[0], 3); + + good(gpad, g); + PQCLEAN_SNTRUP653_AVX2_ntt512_7681(gpad[0], 3); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_7681_x16(f0, g0); + int16x16 d1 = mulmod_7681_x16(f1, g1); + int16x16 d2 = mulmod_7681_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_7681_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_7681_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_7681_x16(h0)); + store_x16(&hpad[1][i], squeeze_7681_x16(h1)); + store_x16(&hpad[2][i], squeeze_7681_x16(h2)); + } + + PQCLEAN_SNTRUP653_AVX2_invntt512_7681(hpad[0], 3); + ungood(h_7681, (const int16(*)[512]) hpad); + + good(fpad, f); + PQCLEAN_SNTRUP653_AVX2_ntt512_10753(fpad[0], 3); + + good(gpad, g); + PQCLEAN_SNTRUP653_AVX2_ntt512_10753(gpad[0], 3); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_10753_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_10753_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_10753_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_10753_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_10753_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_10753_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_10753_x16(f0, g0); + int16x16 d1 = mulmod_10753_x16(f1, g1); + int16x16 d2 = mulmod_10753_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_10753_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_10753_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_10753_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_10753_x16(h0)); + store_x16(&hpad[1][i], squeeze_10753_x16(h1)); + store_x16(&hpad[2][i], squeeze_10753_x16(h2)); + } + + PQCLEAN_SNTRUP653_AVX2_invntt512_10753(hpad[0], 3); + ungood(h_10753, (const int16(*)[512]) hpad); + + for (i = 0; i < 1536; i += 16) { + int16x16 u1 = load_x16(&h_10753[i]); + int16x16 u2 = load_x16(&h_7681[i]); + int16x16 t; + u1 = mulmod_10753_x16(u1, const_x16(1268)); + u2 = mulmod_7681_x16(u2, const_x16(956)); + t = mulmod_7681_x16(sub_x16(u2, u1), const_x16(-2539)); + t = add_x16(u1, mulmod_4621_x16(t, const_x16(1487))); + store_x16(&h[i], t); + } +} + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xint16 + +#define p 653 +#define q 4621 + +static inline int16x16 freeze_4621_x16(int16x16 x) { + int16x16 mask, xq; + x = add_x16(x, const_x16(q)&signmask_x16(x)); + mask = signmask_x16(sub_x16(x, const_x16((q + 1) / 2))); + xq = sub_x16(x, const_x16(q)); + x = _mm256_blendv_epi8(xq, x, mask); + return x; +} + +int PQCLEAN_SNTRUP653_AVX2_crypto_core_multsntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + vec768 x1, x2; + vec1536 x3; +#define f (x1.v) +#define g (x2.v) 
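mult768 above computes the length-768 product twice, once modulo 7681 and once modulo 10753, and then recombines the two residues. Because one factor is ternary (coefficients in {-1,0,1}) and the other has coefficients of absolute value at most (q-1)/2 = 2310, every coefficient of the exact integer product is at most 653*2310, far inside the CRT range 7681*10753/2, so the residue pair determines it uniquely before the final reduction modulo q = 4621. A scalar sketch of that recombination (an illustration only: the function name, the canonical-residue inputs and the constant 5380 = 7681^-1 mod 10753 are assumptions of this sketch, not values taken from the patch):

#include <stdint.h>

/* r1 in [0,7681) and r2 in [0,10753) are residues of the same integer x with
   |x| < 7681*10753/2; returns x as a signed value for the caller to reduce mod 4621 */
static int64_t crt_7681_10753(int64_t r1, int64_t r2) {
    const int64_t m = 7681LL * 10753;
    int64_t d = ((r2 - r1) % 10753) * 5380 % 10753;  /* 5380 = 7681^-1 mod 10753 */
    if (d < 0) {
        d += 10753;
    }
    int64_t x = r1 + 7681 * d;  /* unique lift in [0, m) */
    if (x > m / 2) {
        x -= m;                 /* centered representative */
    }
    return x;
}

The vectorized combine at the end of mult768 reaches the same result on 16-bit lanes in Montgomery form; the constants 1268, 956, -2539 and 1487 play the corresponding scaling and recombination roles there, and freeze_4621_x16 performs the final reduction.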
+#define fg (x3.v) +#define h f + int i; + int16x16 x; + + x = const_x16(0); + for (i = p & ~15; i < 768; i += 16) { + store_x16(&f[i], x); + } + for (i = p & ~15; i < 768; i += 16) { + store_x16(&g[i], x); + } + + crypto_decode_pxint16(f, inbytes); + + for (i = 0; i < 768; i += 16) { + x = load_x16(&f[i]); + x = freeze_4621_x16(squeeze_4621_x16(x)); + store_x16(&f[i], x); + } + for (i = 0; i < p; ++i) { + int8 gi = kbytes[i]; + int8 gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + mult768(fg, f, g); + + fg[0] -= fg[p - 1]; + for (i = 0; i < 768; i += 16) { + int16x16 fgi = load_x16(&fg[i]); + int16x16 fgip = load_x16(&fg[i + p]); + int16x16 fgip1 = load_x16(&fg[i + p - 1]); + x = add_x16(fgi, add_x16(fgip, fgip1)); + x = freeze_4621_x16(squeeze_4621_x16(x)); + store_x16(&h[i], x); + } + + crypto_encode_pxint16(outbytes, h); + + return 0; +} diff --git a/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653.h b/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653.h new file mode 100644 index 00000000..cd6648d5 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_MULTSNTRUP653_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_MULTSNTRUP653_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_multsntrup653_OUTPUTBYTES 1306 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_multsntrup653_INPUTBYTES 1306 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_multsntrup653_KEYBYTES 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_multsntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_AVX2_crypto_core_multsntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653_ntt.c b/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653_ntt.c new file mode 100644 index 00000000..6fe2436c --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653_ntt.c @@ -0,0 +1,927 @@ +#include "crypto_core_multsntrup653.h" +#include "crypto_core_multsntrup653_ntt.h" +#include +#include + +/* auto-generated; do not edit */ + + +typedef int8_t int8; +typedef int16_t int16; + +#define zeta(n,i) (((__m256i *) zeta_##n)[(i)]) +#define zeta_x4(n,i) (((__m256i *) zeta_x4_##n)[(i)]) +#define zeta_qinv(n,i) (((__m256i *) qinvzeta_##n)[(i)]) +#define zeta_x4_qinv(n,i) (((__m256i *) qinvzeta_x4_##n)[(i)]) +#define zetainv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_x4_##n+2*(n)+4-16*((i)+1))) +#define zetainv_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_x4_##n+2*(n)+4-16*((i)+1))) + +typedef union { + int16 data[93 * 16]; + __m256i _dummy; +} vec1488; + +static const vec1488 qdata_7681 = { .data = { + +#define q_x16 (qdata[0]) + 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, + +#define qrecip_x16 (qdata[1]) + 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, + +#define qshift_x16 (qdata[2]) + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + +#define zeta4_x16 (qdata[3]) + -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, + +#define zeta4_x16_qinv (qdata[4]) + -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, 
-28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, + +#define zeta8_x16 (qdata[5]) + -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, + +#define zeta8_x16_qinv (qdata[6]) + -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, + +#define zetainv8_x16 (qdata[7]) + -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, + +#define zetainv8_x16_qinv (qdata[8]) + -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, + +#define zeta_x4_16 (qdata+9) + -3593, -3593, -3593, -3593, -2194, -2194, -2194, -2194, -3625, -3625, -3625, -3625, 1100, 1100, 1100, 1100, + -3777, -3777, -3777, -3777, -2456, -2456, -2456, -2456, 3182, 3182, 3182, 3182, 3696, 3696, 3696, 3696, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_16 (qdata+12) + -9, -9, -9, -9, 4974, 4974, 4974, 4974, -16425, -16425, -16425, -16425, 7244, 7244, 7244, 7244, + -28865, -28865, -28865, -28865, -14744, -14744, -14744, -14744, 10350, 10350, 10350, 10350, -4496, -4496, -4496, -4496, + 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_x4_32 (qdata+15) + -3593, -3593, -3593, -3593, 1414, 1414, 1414, 1414, -2194, -2194, -2194, -2194, -2495, -2495, -2495, -2495, + -3625, -3625, -3625, -3625, 2876, 2876, 2876, 2876, 1100, 1100, 1100, 1100, -2250, -2250, -2250, -2250, + -3777, -3777, -3777, -3777, -1701, -1701, -1701, -1701, -2456, -2456, -2456, -2456, 834, 834, 834, 834, + 3182, 3182, 3182, 3182, -2319, -2319, -2319, -2319, 3696, 3696, 3696, 3696, 121, 121, 121, 121, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_32 (qdata+20) + -9, -9, -9, -9, 20870, 20870, 20870, 20870, 4974, 4974, 4974, 4974, 22593, 22593, 22593, 22593, + -16425, -16425, -16425, -16425, 828, 828, 828, 828, 7244, 7244, 7244, 7244, -23754, -23754, -23754, -23754, + -28865, -28865, -28865, -28865, 20315, 20315, 20315, 20315, -14744, -14744, -14744, -14744, 18242, 18242, 18242, 18242, + 10350, 10350, 10350, 10350, -18191, -18191, -18191, -18191, -4496, -4496, -4496, -4496, -11655, -11655, -11655, -11655, + 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_64 (qdata+25) + -3593, -617, 1414, 3706, -2194, -1296, -2495, -2237, -3625, 2830, 2876, -1599, 1100, 1525, -2250, 2816, + -3777, 1921, -1701, 2006, -2456, 1483, 834, -1986, 3182, 3364, -2319, -1993, 3696, -2557, 121, 2088, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_64 (qdata+28) + -9, 19351, 20870, -15750, 4974, -9488, 22593, 7491, -16425, 26382, 828, 23489, 7244, 20469, -23754, 2816, + -28865, -5759, 20315, -3114, -14744, 15307, 18242, -19394, 10350, -10972, -18191, -31177, -4496, -25597, -11655, 22568, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_128 (qdata+31) + -3593, -2804, -617, -396, 1414, -549, 3706, 810, -2194, -1321, -1296, 438, -2495, -2535, -2237, -3689, + -3625, 2043, 2830, -1881, 2876, 3153, -1599, 7, 1100, -514, 1525, -1760, -2250, -2440, 2816, 3600, + -3777, 103, 1921, -3174, -1701, 1535, 2006, -1887, -2456, 1399, 1483, -679, 834, 3772, -1986, 1738, + 3182, -1431, 3364, -3555, -2319, -2310, -1993, 638, 3696, -2956, -2557, -1305, 121, 2555, 2088, -3266, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_128 (qdata+36) + -9, -29428, 19351, 26228, 20870, 21467, -15750, 
5930, 4974, -14121, -9488, -21066, 22593, 2073, 7491, 16279, + -16425, -25093, 26382, 26279, 828, -29103, 23489, 11783, 7244, 14846, 20469, 14624, -23754, -6536, 2816, 11792, + -28865, -4505, -5759, -6246, 20315, 9215, -3114, 6817, -14744, 4983, 15307, -28839, 18242, 1724, -19394, 23242, + 10350, -21399, -10972, -29667, -18191, -21766, -31177, 15998, -4496, 23668, -25597, -5913, -11655, -24581, 22568, -20674, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_256 (qdata+41) + -3593, 2665, -2804, -2572, -617, 727, -396, 3417, 1414, 2579, -549, 373, 3706, 3750, 810, -1054, + -2194, -2133, -1321, 1681, -1296, -1386, 438, -2732, -2495, 1919, -2535, -2391, -2237, 2835, -3689, 2, + -3625, -783, 2043, 3145, 2830, 1533, -1881, 2789, 2876, 2649, 3153, 3692, -1599, -1390, 7, -1166, + 1100, 3310, -514, 2224, 1525, -2743, -1760, 2385, -2250, -486, -2440, -1756, 2816, -3816, 3600, -3831, + -3777, -1799, 103, 1497, 1921, 1521, -3174, -194, -1701, -859, 1535, 2175, 2006, -2762, -1887, -1698, + -2456, -3480, 1399, 2883, 1483, -3428, -679, -2113, 834, 1532, 3772, -660, -1986, -2764, 1738, -915, + 3182, 1056, -1431, 1350, 3364, 1464, -3555, 2919, -2319, -2160, -2310, 730, -1993, -1598, 638, 3456, + 3696, -1168, -2956, -3588, -2557, -921, -1305, 3405, 121, -404, 2555, -3135, 2088, 2233, -3266, -2426, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_256 (qdata+50) + -9, -17303, -29428, 24052, 19351, -12073, 26228, -24743, 20870, -12269, 21467, 19317, -15750, -25946, 5930, 32738, + 4974, -4693, -14121, 2193, -9488, 26262, -21066, 7508, 22593, 9599, 2073, 10409, 7491, -12013, 16279, -15358, + -16425, -16655, -25093, 32329, 26382, 24573, 26279, 13541, 828, -25511, -29103, 26220, 23489, -8558, 11783, -24718, + 7244, 10478, 14846, 26800, 20469, 26441, 14624, -29871, -23754, -3558, -6536, -16092, 2816, 8472, 11792, -7415, + -28865, -13575, -4505, -26663, -5759, -14351, -6246, -17602, 20315, -22875, 9215, 9855, -3114, -24266, 6817, -2722, + -14744, -15768, 4983, 12611, 15307, -21860, -28839, -27201, 18242, 32252, 1724, 21868, -19394, -8908, 23242, 13933, + 10350, 17440, -21399, -11962, -10972, 30136, -29667, -1689, -18191, 6032, -21766, 30426, -31177, 15810, 15998, 3456, + -4496, -9360, 23668, 27132, -25597, -5529, -5913, 1869, -11655, 22124, -24581, 21953, 22568, 23225, -20674, 17030, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_512 (qdata+59) + -3593, 2005, 2665, 2891, -2804, 2345, -2572, 1121, -617, -188, 727, 2786, -396, -3208, 3417, -17, + 1414, -3752, 2579, 2815, -549, 1837, 373, 151, 3706, -1012, 3750, -1509, 810, -3214, -1054, 3177, + -2194, -1403, -2133, -3314, -1321, 83, 1681, -658, -1296, 2070, -1386, -3547, 438, 3781, -2732, 2230, + -2495, -1669, 1919, 2589, -2535, -3312, -2391, -3542, -2237, -1441, 2835, -3568, -3689, -402, 2, -1070, + -3625, 3763, -783, -3550, 2043, -2303, 3145, -436, 2830, -893, 1533, 1712, -1881, 124, 2789, -2001, + 2876, -2460, 2649, 3770, 3153, 2965, 3692, -1203, -1599, 2874, -1390, -1407, 7, -3745, -1166, 1649, + 1100, 2937, 3310, 3461, -514, -1526, 2224, 715, 1525, -1689, -2743, 434, -1760, -3163, 2385, -929, + -2250, -2167, -486, -1144, -2440, -370, -1756, 2378, 2816, -1084, -3816, -1586, 3600, 1931, -3831, -1242, + -3777, 592, -1799, 2340, 103, -1338, 1497, -2071, 1921, 1519, 1521, 451, -3174, 589, -194, -3744, + -1701, 3677, -859, -1295, 1535, 642, 2175, -3794, 2006, 2130, -2762, 2918, -1887, 3334, -1698, 2072, + -2456, 509, -3480, 2998, 1399, -3408, 2883, 1476, 1483, -2262, -3428, -1779, -679, 2258, -2113, 1348, + 
834, -692, 1532, 2247, 3772, 2083, -660, -226, -1986, 2532, -2764, -3693, 1738, -429, -915, -2059, + 3182, 2812, 1056, 3434, -1431, -2515, 1350, -236, 3364, -2386, 1464, 222, -3555, -2963, 2919, -2422, + -2319, -3657, -2160, 3450, -2310, -791, 730, 1181, -1993, -1404, -1598, 2339, 638, -3366, 3456, 2161, + 3696, -3343, -1168, 2719, -2956, -826, -3588, -670, -2557, 777, -921, 1151, -1305, -796, 3405, -1278, + 121, -3287, -404, 1072, 2555, 293, -3135, 2767, 2088, -3335, 2233, 3581, -3266, 3723, -2426, -179, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_512 (qdata+76) + -9, 4565, -17303, 16715, -29428, 15145, 24052, -22943, 19351, 1860, -12073, -28958, 26228, -7304, -24743, -529, + 20870, -24232, -12269, 10495, 21467, -16083, 19317, 20119, -15750, -27636, -25946, -12261, 5930, -26766, 32738, -16791, + 4974, 25733, -4693, 20238, -14121, 18003, 2193, 6510, -9488, 29718, 26262, -25563, -21066, -1851, 7508, -19274, + 22593, -28805, 9599, -23523, 2073, 4880, 10409, 1578, 7491, -10145, -12013, 4624, 16279, 6766, -15358, 24530, + -16425, 5299, -16655, -2526, -25093, -9983, 32329, 5708, 26382, -23933, 24573, 26288, 26279, 30844, 13541, 30255, + 828, 15972, -25511, 17082, -29103, -27243, 26220, -2739, 23489, 16186, -8558, -9087, 11783, -12449, -24718, -14223, + 7244, -8839, 10478, 30597, 14846, -12790, 26800, 14539, 20469, -6297, 26441, 9650, 14624, -25179, -29871, -9633, + -23754, -5751, -3558, 2952, -6536, 23182, -16092, 23882, 2816, 964, 8472, -10802, 11792, -17013, -7415, -30938, + -28865, -23984, -13575, -11996, -4505, -14650, -26663, -22039, -5759, 1007, -14351, 10179, -6246, -947, -17602, -20128, + 20315, 10333, -22875, -17167, 9215, -14718, 9855, -29394, -3114, 27730, -24266, 5990, 6817, 22790, -2722, 14360, + -14744, 23549, -15768, -18506, 4983, 21168, 12611, 3524, 15307, 2858, -21860, 29453, -28839, 27858, -27201, 3396, + 18242, 5452, 32252, -18745, 1724, -4573, 21868, 31518, -19394, 20964, -8908, -18541, 23242, 17491, 13933, 16885, + 10350, -32004, 17440, -24214, -21399, -20435, -11962, -22764, -10972, -27986, 30136, -802, -29667, 11885, -1689, -13686, + -18191, 32695, 6032, -16006, -21766, -20759, 30426, -24931, -31177, -32124, 15810, -4317, 15998, 26330, 3456, -13711, + -4496, -19215, -9360, 26783, 23668, -14138, 27132, -32414, -25597, -2807, -5529, 8831, -5913, 17636, 1869, -16638, + -11655, 9513, 22124, 25648, -24581, -21723, 21953, -14129, 22568, -15111, 23225, 26621, -20674, -15221, 17030, -1715, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static const vec1488 qdata_10753 = { .data = { + + 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, + + 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, + + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + + 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, + + 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, + + 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, + + -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, + + 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, + + -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, + + 1018, 1018, 1018, 1018, 2413, 2413, 2413, 2413, 4188, 4188, 4188, 
4188, 357, 357, 357, 357, + 223, 223, 223, 223, -3686, -3686, -3686, -3686, -3688, -3688, -3688, -3688, -376, -376, -376, -376, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 10093, 10093, 10093, 10093, -1956, -1956, -1956, -1956, 28517, 28517, 28517, 28517, + 27359, 27359, 27359, 27359, -21094, -21094, -21094, -21094, 408, 408, 408, 408, -20856, -20856, -20856, -20856, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 1018, 1018, 1018, -2695, -2695, -2695, -2695, 2413, 2413, 2413, 2413, 425, 425, 425, 425, + 4188, 4188, 4188, 4188, -4855, -4855, -4855, -4855, 357, 357, 357, 357, -3364, -3364, -3364, -3364, + 223, 223, 223, 223, 730, 730, 730, 730, -3686, -3686, -3686, -3686, -4544, -4544, -4544, -4544, + -3688, -3688, -3688, -3688, -2236, -2236, -2236, -2236, -376, -376, -376, -376, 3784, 3784, 3784, 3784, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 7033, 7033, 7033, 7033, 10093, 10093, 10093, 10093, 18345, 18345, 18345, 18345, + -1956, -1956, -1956, -1956, 29449, 29449, 29449, 29449, 28517, 28517, 28517, 28517, -9508, -9508, -9508, -9508, + 27359, 27359, 27359, 27359, 16090, 16090, 16090, 16090, -21094, -21094, -21094, -21094, 28224, 28224, 28224, 28224, + 408, 408, 408, 408, -12476, -12476, -12476, -12476, -20856, -20856, -20856, -20856, 16072, 16072, 16072, 16072, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -1520, -2695, 1341, 2413, 918, 425, 5175, 4188, -4035, -4855, 341, 357, 4347, -3364, 5213, + 223, -4875, 730, 1931, -3686, -2503, -4544, -4095, -3688, 5063, -2236, -3823, -376, 3012, 3784, -2629, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 23056, 7033, 829, 10093, 26518, 18345, 3639, -1956, -4547, 29449, 3925, 28517, -7429, -9508, -11683, + 27359, -17675, 16090, 14731, -21094, -25543, 28224, -14847, 408, 28103, -12476, 10001, -20856, -7228, 16072, 18363, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -2935, -1520, -4744, -2695, -205, 1341, 1299, 2413, 4, 918, -4379, 425, -4616, 5175, -544, + 4188, 4129, -4035, 4102, -4855, -1287, 341, -2388, 357, 1284, 4347, 2984, -3364, 2178, 5213, -2576, + 223, 2790, -4875, 4876, 730, -4513, 1931, -3085, -3686, 3550, -2503, 847, -4544, 193, -4095, 1085, + -3688, 3091, 5063, -4742, -2236, 2982, -3823, -1009, -376, -268, 3012, 3062, 3784, -2565, -2629, 4189, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 31369, 23056, 15736, 7033, -24269, 829, -6381, 10093, 22532, 26518, 23781, 18345, 15864, 3639, 15840, + -1956, -23007, -4547, 5126, 29449, 8441, 3925, -16724, 28517, 23812, -7429, 31656, -9508, -19326, -11683, -27152, + 27359, 20198, -17675, 6924, 16090, 22623, 14731, 5619, -21094, -24098, -25543, 3407, 28224, 22209, -14847, 573, + 408, -4589, 28103, -5766, -12476, -12378, 10001, -31217, -20856, -2316, -7228, -20490, 16072, -14341, 18363, -12707, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -4734, -2935, -400, -1520, 4977, -4744, -2973, -2695, 512, -205, -779, 1341, -1356, 1299, 635, + 2413, 567, 4, -4286, 918, -5114, -4379, -1586, 425, 1615, -4616, -336, 5175, -1841, -544, 2234, + 4188, -3441, 4129, 636, -4035, -4580, 4102, 2684, -4855, 3057, -1287, -2740, 341, -5156, -2388, -472, + 357, -794, 1284, 578, 4347, 3615, 2984, -3715, -3364, 2271, 2178, -326, 5213, 454, -2576, -3337, + 223, 2998, 2790, -151, -4875, 2981, 4876, 1324, 730, 2774, -4513, 2206, 1931, 886, -3085, -970, + -3686, 3198, 3550, 2737, -2503, -909, 847, 1068, -4544, -2213, 193, 2884, -4095, -4808, 1085, 4123, + -3688, 
5341, 3091, 5294, 5063, -116, -4742, -5116, -2236, -2045, 2982, -1572, -3823, 4828, -1009, 467, + -376, 5023, -268, -3169, 3012, -1458, 3062, -1268, 3784, -675, -2565, 1006, -2629, 5064, 4189, 864, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -26238, 31369, -24976, 23056, -30351, 15736, -18845, 7033, 512, -24269, -13579, 829, 29364, -6381, -11141, + 10093, -969, 22532, 6978, 26518, -4090, 23781, 11726, 18345, 4175, 15864, 7856, 3639, 719, 15840, -31558, + -1956, 31887, -23007, -21892, -4547, 22044, 5126, -19844, 29449, -32271, 8441, 32076, 3925, -11300, -16724, 28200, + 28517, 16614, 23812, 11842, -7429, -2017, 31656, 28541, -9508, 29407, -19326, 31418, -11683, -31290, -27152, 27895, + 27359, 12214, 20198, -14999, -17675, -1627, 6924, -13012, 16090, -4394, 22623, 7326, 14731, -22666, 5619, 8246, + -21094, 24702, -24098, 177, -25543, 7795, 3407, -13268, 28224, 2395, 22209, -7356, -14847, -17096, 573, -24037, + 408, -11555, -4589, -30546, 28103, 1932, -5766, 17412, -12476, 31235, -12378, -7716, 10001, -1316, -31217, 25555, + -20856, -609, -2316, -8801, -7228, 11854, -20490, 780, 16072, -17571, -14341, -2066, 18363, 17352, -12707, 17248, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 3453, -4734, 4519, -2935, 2118, -400, -554, -1520, 2196, 4977, 1893, -4744, -1409, -2973, -1053, + -2695, 4601, 512, 279, -205, -3241, -779, 4889, 1341, 3524, -1356, -1663, 1299, 2283, 635, 73, + 2413, 2428, 567, 624, 4, -1930, -4286, 3419, 918, -2062, -5114, 5068, -4379, -97, -1586, 1782, + 425, 4621, 1615, 355, -4616, 1349, -336, 825, 5175, 3135, -1841, 1160, -544, 4408, 2234, -2605, + 4188, 854, -3441, -1056, 4129, 2439, 636, 4967, -4035, -4782, -4580, -5268, 4102, -663, 2684, -4670, + -4855, 3760, 3057, 3535, -1287, 2680, -2740, -569, 341, 2139, -5156, 3827, -2388, 1639, -472, 1927, + 357, 5172, -794, -4003, 1284, 4144, 578, 693, 4347, 4784, 3615, 3125, 2984, 1122, -3715, 2113, + -3364, -573, 2271, -4328, 2178, 2909, -326, -4000, 5213, -4447, 454, -3995, -2576, -4428, -3337, 2529, + 223, 5309, 2998, 5120, 2790, -2050, -151, 2963, -4875, 2657, 2981, -2807, 4876, 2237, 1324, -4403, + 730, 2624, 2774, -5083, -4513, 40, 2206, 152, 1931, -1573, 886, 2625, -3085, -778, -970, -5107, + -3686, 4250, 3198, -5356, 3550, -3148, 2737, -3360, -2503, -2015, -909, 3096, 847, 5313, 1068, 834, + -4544, -1132, -2213, -2151, 193, -1722, 2884, -4393, -4095, 2662, -4808, -2788, 1085, -1992, 4123, 5334, + -3688, 5215, 5341, -1689, 3091, -2117, 5294, 4859, 5063, 3410, -116, 2205, -4742, -2374, -5116, -4720, + -2236, 3570, -2045, 2813, 2982, 2087, -1572, -4973, -3823, 458, 4828, 3891, -1009, -2419, 467, -4891, + -376, -1381, 5023, 1204, -268, 274, -3169, -3260, 3012, -1635, -1458, 4540, 3062, -4254, -1268, -1111, + 3784, 2230, -675, -2279, -2565, -4359, 1006, -1510, -2629, 5015, 5064, -2449, 4189, -5005, 864, 2487, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -29827, -26238, -21593, 31369, -29626, -24976, -7722, 23056, -16236, -30351, 30053, 15736, 9343, -18845, -16925, + 7033, 14329, 512, 15127, -24269, -21161, -13579, -1767, 829, -6716, 29364, -12415, -6381, 31467, -11141, 1609, + 10093, -20100, -969, -23952, 22532, -25482, 6978, 8027, 26518, 17394, -4090, -25652, 23781, -5729, 11726, -21770, + 18345, -4083, 4175, -15517, 15864, -19643, 7856, -22215, 3639, -18881, 719, -19320, 15840, -7880, -31558, 22483, + -1956, -6314, 31887, 15328, -23007, -7289, -21892, 11623, -4547, 31058, 22044, 13164, 5126, -15511, -19844, 6594, + 29449, 11952, -32271, 6095, 8441, 23160, 32076, 22471, 3925, 
6747, -11300, 12531, -16724, 8295, 28200, -7801, + 28517, -29644, 16614, -20899, 23812, 12336, 11842, 20661, -7429, 12976, -2017, 23093, 31656, -3998, 28541, 24129, + -9508, -61, 29407, -232, -19326, -13987, 31418, 12384, -11683, -31583, -31290, 24165, -27152, 26292, 27895, 8161, + 27359, 4797, 12214, 5120, 20198, 19454, -14999, -4717, -17675, 8289, -1627, 31497, 6924, 1725, -13012, 19661, + 16090, -30144, -4394, -9691, 22623, 28712, 7326, 4248, 14731, 3035, -22666, 24641, 5619, -24330, 8246, -13811, + -21094, -13158, 24702, -23788, -24098, 27572, 177, 13024, -25543, -29151, 7795, 7192, 3407, 27329, -13268, 12098, + 28224, -19564, 2395, -8807, 22209, 32070, -7356, -22313, -14847, 20070, -17096, 23836, 573, -14280, -24037, -1834, + 408, 32351, -11555, 4967, -4589, 18875, -30546, -6917, 28103, -26286, 1932, 18077, -5766, 29370, 17412, 19856, + -12476, 23026, 31235, -30467, -12378, -24025, -7716, -12653, 10001, -8758, -1316, -20173, -31217, -11123, 25555, 23269, + -20856, -29541, -609, 31924, -2316, 3346, -8801, -13500, -7228, 14237, 11854, 14780, -20490, -9374, 780, 16809, + 16072, 11446, -17571, -8935, -14341, 5369, -2066, -18918, 18363, 19863, 17352, -16273, -12707, 3699, 17248, 951, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static inline __m256i sub_x16(__m256i a, __m256i b) { + //__asm__("vpsubw %1,%0,%0" : "+x"(a),"+x"(b)); + return _mm256_sub_epi16(a, b); +} + +static inline __m256i add_x16(__m256i a, __m256i b) { + return _mm256_add_epi16(a, b); +} + +static inline __m256i reduce_x16(const __m256i *qdata, __m256i x) { + __m256i y = _mm256_mulhi_epi16(x, qrecip_x16); + y = _mm256_mulhrs_epi16(y, qshift_x16); + y = _mm256_mullo_epi16(y, q_x16); + return sub_x16(x, y); +} + +static inline __m256i mulmod_x16_scaled(const __m256i *qdata, __m256i x, __m256i y, __m256i yqinv) { + __m256i b = _mm256_mulhi_epi16(x, y); + __m256i d = _mm256_mullo_epi16(x, yqinv); + __m256i e = _mm256_mulhi_epi16(d, q_x16); + return sub_x16(b, e); +} + +typedef union { + int8 data[32]; + __m256i _dummy; +} byte32; +static const byte32 shuffle_buf = { .data = { + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + } +}; +#define shuffle (*(__m256i *) shuffle_buf.data) + +static inline __m256i _mm256_loadu_reverse16(const __m256i *p) { + __m256i x = _mm256_loadu_si256(p); + x = _mm256_permute2x128_si256(x, x, 1); + x = _mm256_shuffle_epi8(x, shuffle); + return x; +} + +static void ntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + for (rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 = _mm256_loadu_si256((__m256i *) (f + 96)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_0, zetainv_qinv_128_0); + f2 = 
mulmod_x16_scaled(qdata, f2, zeta(128, 0), zeta_qinv(128, 0)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 0), zeta_qinv(64, 0)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = _mm256_loadu_si256((__m256i *) (f + 80)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_1, zetainv_qinv_128_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 1), zeta_qinv(128, 1)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 1), zeta_qinv(64, 1)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 0), zeta_x4_qinv(16, 0)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + 
_mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 32)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 1), zeta_x4_qinv(16, 1)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); + _mm256_storeu_si256((__m256i *) (f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + + f1 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 32)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f2 = add_x16(g2, g3); + f3 = sub_x16(g2, g3); + f2 = reduce_x16(qdata, f2); + f3 = reduce_x16(qdata, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f0 = reduce_x16(qdata, f0); + + h0 = f0; + h1 = f1; + h2 = f2; + h3 = f3; + + f1 = _mm256_loadu_si256((__m256i *) (f + 80)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv8_x16, zetainv8_x16_qinv); + f2 = mulmod_x16_scaled(qdata, f2, zeta8_x16, zeta8_x16_qinv); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta4_x16, zeta4_x16_qinv); + f0 = reduce_x16(qdata, f0); + + g0 = add_x16(h0, f0); + g1 = add_x16(h1, f1); + g2 = add_x16(h2, f2); + g3 = add_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 0), g0); + _mm256_storeu_si256((__m256i *) (f + 16), g1); + _mm256_storeu_si256((__m256i *) (f + 32), g2); + _mm256_storeu_si256((__m256i *) (f + 48), g3); + g0 = sub_x16(h0, f0); + g1 = sub_x16(h1, f1); + g2 = sub_x16(h2, f2); + g3 = sub_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 64), g0); + _mm256_storeu_si256((__m256i *) (f + 80), g1); + _mm256_storeu_si256((__m256i *) (f + 96), g2); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + f += 128; + } +} + +static void ntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + int16 *origf = f; + int rep; + __m256i zetainv_512[8]; + __m256i 
zetainv_qinv_512[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_512[i], zetainv_qinv_512[i]); + f2 = mulmod_x16_scaled(qdata, f2, zeta(512, i), zeta_qinv(512, i)); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(256, i), zeta_qinv(256, i)); + f0 = reduce_x16(qdata, f0); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i), f0); + + } + f += 512; + } + f = origf; + ntt128(f, reps * 4, qdata); +} + +void PQCLEAN_SNTRUP653_AVX2_ntt512_7681(int16 *f, int reps) { + ntt512(f, reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_SNTRUP653_AVX2_ntt512_10753(int16 *f, int reps) { + ntt512(f, reps, (const __m256i *) qdata_10753.data); +} + +static void invntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_x4_16_0 = zetainv_x4(16, 0); + __m256i zetainv_x4_qinv_16_0 = zetainv_x4_qinv(16, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_64_0 = zetainv(64, 0); + __m256i zetainv_qinv_64_0 = zetainv_qinv(64, 0); + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i zetainv_x4_16_1 = zetainv_x4(16, 1); + __m256i zetainv_x4_qinv_16_1 = zetainv_x4_qinv(16, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + __m256i zetainv_64_1 = zetainv(64, 1); + __m256i zetainv_qinv_64_1 = zetainv_qinv(64, 1); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g0 = _mm256_loadu_si256((__m256i *) (f + 32)); + g1 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + h1 = sub_x16(f0, f1); + h1 = reduce_x16(qdata, h1); + h0 = add_x16(f0, f1); + h3 = sub_x16(f2, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h2 = add_x16(f2, f3); + f1 = sub_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zetainv8_x16, zetainv8_x16_qinv); + f0 = add_x16(g0, g1); + f3 = sub_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zeta8_x16, zeta8_x16_qinv); + f2 = add_x16(g2, g3); + + g0 = add_x16(h0, h2); + g0 = reduce_x16(qdata, g0); + g2 = sub_x16(h0, h2); + g2 = reduce_x16(qdata, g2); + g1 = sub_x16(h1, h3); + g3 = add_x16(h1, h3); + h2 = sub_x16(f0, f2); + h2 = mulmod_x16_scaled(qdata, h2, zeta4_x16, 
zeta4_x16_qinv); + h0 = add_x16(f0, f2); + h3 = add_x16(f1, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h1 = sub_x16(f1, f3); + + f0 = add_x16(g0, h0); + g0 = sub_x16(g0, h0); + f1 = add_x16(g1, h1); + g1 = sub_x16(g1, h1); + f2 = sub_x16(g2, h2); + g2 = add_x16(g2, h2); + f3 = sub_x16(g3, h3); + g3 = add_x16(g3, h3); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), g0); + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 96), g1); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + _mm256_storeu_si256((__m256i *) (f + 48), g2); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = _mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 = _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_0, zetainv_x4_qinv_16_0); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 32)); + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = _mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 = _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_1, zetainv_x4_qinv_16_1); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); + _mm256_storeu_si256((__m256i *) (f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 
= _mm256_loadu_si256((__m256i *) (f + 96)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = _mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_0, zetainv_qinv_128_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 0), zeta_qinv(128, 0)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_0, zetainv_qinv_64_0); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = _mm256_loadu_si256((__m256i *) (f + 80)); + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = _mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_1, zetainv_qinv_128_1); + f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 1), zeta_qinv(128, 1)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_1, zetainv_qinv_64_1); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + + f += 128; + } +} + +static void invntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + /* [-Werror=unused-variable] */ /* int16 *origf = f; */ + int rep; + __m256i zetainv_512[8]; + __m256i zetainv_qinv_512[8]; + __m256i zetainv_256[8]; + __m256i zetainv_qinv_256[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_256[i] = zetainv(256, i); + } + for (i = 0; i < 
8; ++i) { + zetainv_qinv_256[i] = zetainv_qinv(256, i); + } + invntt128(f, 4 * reps, qdata); + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_512[i], zetainv_qinv_512[i]); + f3 = mulmod_x16_scaled(qdata, f3, zeta(512, i), zeta_qinv(512, i)); + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_256[i], zetainv_qinv_256[i]); + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + } + f += 512; + } +} + +void PQCLEAN_SNTRUP653_AVX2_invntt512_7681(int16 *f, int reps) { + invntt512(f, reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_SNTRUP653_AVX2_invntt512_10753(int16 *f, int reps) { + invntt512(f, reps, (const __m256i *) qdata_10753.data); +} diff --git a/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653_ntt.h b/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653_ntt.h new file mode 100644 index 00000000..44fff973 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_multsntrup653_ntt.h @@ -0,0 +1,13 @@ +#ifndef ntt_H +#define ntt_H + +#include <stdint.h> + + + +extern void PQCLEAN_SNTRUP653_AVX2_ntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP653_AVX2_ntt512_10753(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP653_AVX2_invntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP653_AVX2_invntt512_10753(int16_t *f, int reps); + +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_core_scale3sntrup653.c b/crypto_kem/sntrup653/avx2/crypto_core_scale3sntrup653.c new file mode 100644 index 00000000..fbbd1f0f --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_scale3sntrup653.c @@ -0,0 +1,47 @@ +#include "crypto_core_scale3sntrup653.h" +#include "crypto_decode_653xint16.h" +#include "crypto_encode_653xint16.h" +#include <immintrin.h> + +#define p 653 +#define q 4621 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xint16 + +typedef int16_t Fq; + +/* out = 3*in in Rq */ +int PQCLEAN_SNTRUP653_AVX2_crypto_core_scale3sntrup653(unsigned char *outbytes, const unsigned char *inbytes) { + int i = p - 16; + + __m256i save = _mm256_loadu_si256((__m256i *) (inbytes + 2 * i)); + /* in case outbytes = inbytes */ + + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) inbytes); + __m256i xneg; + x = _mm256_mullo_epi16(x, _mm256_set1_epi16(3)); + x = _mm256_sub_epi16(x, _mm256_set1_epi16((q + 1) / 2)); + xneg = _mm256_srai_epi16(x, 15); + x = _mm256_add_epi16(x, _mm256_set1_epi16(q)&xneg); + xneg = _mm256_srai_epi16(x, 15); + x = _mm256_add_epi16(x, _mm256_set1_epi16(q)&xneg); + x = _mm256_sub_epi16(x, _mm256_set1_epi16((q - 1) / 2)); + _mm256_storeu_si256((__m256i *) outbytes, x); + + inbytes += 32; + outbytes += 32; + i -= 16; + } while (i >= 0); + if (i <= -16) { + break; + } + inbytes += 2 * i; + outbytes += 2 * i; +
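+      /* p = 653 is not a multiple of 16, so 13 coefficients remain after the main
+         loop and i is now in -15..-1: the pointers are stepped back so that one
+         more full 16-coefficient pass covers exactly the last 16 coefficients.
+         Storing the saved input vector first restores that input window in case
+         outbytes aliases inbytes (part of it was already overwritten above). */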
_mm256_storeu_si256((__m256i *) outbytes, save); + } + + return 0; +} diff --git a/crypto_kem/sntrup653/avx2/crypto_core_scale3sntrup653.h b/crypto_kem/sntrup653/avx2/crypto_core_scale3sntrup653.h new file mode 100644 index 00000000..057afaf1 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_scale3sntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_SCALE3SNTRUP653_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_SCALE3SNTRUP653_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_scale3sntrup653_OUTPUTBYTES 1306 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_scale3sntrup653_INPUTBYTES 1306 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_scale3sntrup653_KEYBYTES 0 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_scale3sntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_AVX2_crypto_core_scale3sntrup653(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_core_weightsntrup653.c b/crypto_kem/sntrup653/avx2/crypto_core_weightsntrup653.c new file mode 100644 index 00000000..c2b932e6 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_weightsntrup653.c @@ -0,0 +1,45 @@ +#include "crypto_core_weightsntrup653.h" +#include "crypto_encode_int16.h" +#include "params.h" +#include <immintrin.h> + +#define int8 int8_t +#define int16 int16_t + + +/* out = little-endian weight of bottom bits of in */ +int PQCLEAN_SNTRUP653_AVX2_crypto_core_weightsntrup653(unsigned char *outbytes, const unsigned char *inbytes) { + int8 *in = (void *) inbytes; + int i; + __m256i sum, sumhi; + int16 weight; + + sum = _mm256_loadu_si256((__m256i *) (in + p - 32)); + sum &= _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + + for (i = p - 32; i >= 0; i -= 32) { + __m256i bits = _mm256_loadu_si256((__m256i *) in); + bits &= _mm256_set1_epi8(1); + sum = _mm256_add_epi8(sum, bits); + in += 32; + } + + /* sum is 32xint8; want to add these int8 */ + sumhi = _mm256_srli_epi16(sum, 8); + sum &= _mm256_set1_epi16(0xff); + sum = _mm256_add_epi16(sum, sumhi); + + /* sum is 16xint16; want to add these int16 */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[1]+sum[2]+sum[3]+sum[8]+sum[9]+sum[10]+sum[11] */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[1]+sum[8]+sum[9] */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[8] */ + + weight = _mm256_extract_epi16(sum, 0); + weight += _mm256_extract_epi16(sum, 8); + + PQCLEAN_SNTRUP653_AVX2_crypto_encode_int16(outbytes, &weight); + return 0; +} diff --git a/crypto_kem/sntrup653/avx2/crypto_core_weightsntrup653.h b/crypto_kem/sntrup653/avx2/crypto_core_weightsntrup653.h new file mode 100644 index 00000000..99fa8769 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_weightsntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_WEIGHTSNTRUP653_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_WEIGHTSNTRUP653_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_weightsntrup653_OUTPUTBYTES 2 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_weightsntrup653_INPUTBYTES 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_weightsntrup653_KEYBYTES 0 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_weightsntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_AVX2_crypto_core_weightsntrup653(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_core_wforcesntrup653.c b/crypto_kem/sntrup653/avx2/crypto_core_wforcesntrup653.c new file mode 100644 index 00000000..90da1ebb --- /dev/null +++
b/crypto_kem/sntrup653/avx2/crypto_core_wforcesntrup653.c @@ -0,0 +1,61 @@ +#include "crypto_core_wforcesntrup653.h" +#include "crypto_decode_int16.h" +#include "params.h" +#include <immintrin.h> + +#define int16 int16_t + + +/* out = in if bottom bits of in have weight w */ +/* otherwise out = (1,1,...,1,0,0,...,0) */ +int PQCLEAN_SNTRUP653_AVX2_crypto_core_wforcesntrup653(unsigned char *out, const unsigned char *in) { + int16 weight; + int16 mask; + __m256i maskvec; + int i; + + crypto_core_weight((unsigned char *) &weight, in); + PQCLEAN_SNTRUP653_AVX2_crypto_decode_int16(&weight, (unsigned char *) &weight); + + mask = (weight - w) | (w - weight); + mask >>= 15; + maskvec = _mm256_set1_epi16((short) ~mask); + + i = w - 32; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) in); + x ^= _mm256_set1_epi8(1); + x &= maskvec; + x ^= _mm256_set1_epi8(1); + _mm256_storeu_si256((__m256i *) out, x); + in += 32; + out += 32; + i -= 32; + } while (i >= 0); + if (i <= -32) { + break; + } + in += i; + out += i; + } + + i = p - w - 32; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) in); + x &= maskvec; + _mm256_storeu_si256((__m256i *) out, x); + in += 32; + out += 32; + i -= 32; + } while (i >= 0); + if (i <= -32) { + break; + } + in += i; + out += i; + } + + return 0; +} diff --git a/crypto_kem/sntrup653/avx2/crypto_core_wforcesntrup653.h b/crypto_kem/sntrup653/avx2/crypto_core_wforcesntrup653.h new file mode 100644 index 00000000..2e4d78c4 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_core_wforcesntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_WFORCESNTRUP653_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_CORE_WFORCESNTRUP653_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_wforcesntrup653_OUTPUTBYTES 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_wforcesntrup653_INPUTBYTES 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_wforcesntrup653_KEYBYTES 0 +#define PQCLEAN_SNTRUP653_AVX2_crypto_core_wforcesntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_AVX2_crypto_core_wforcesntrup653(unsigned char *out, const unsigned char *in); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_653x1541.c b/crypto_kem/sntrup653/avx2/crypto_decode_653x1541.c new file mode 100644 index 00000000..da5d6f19 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_653x1541.c @@ -0,0 +1,408 @@ +#include "crypto_decode_653x1541.h" +#include <immintrin.h> +/* auto-generated; do not edit */ + +#define int16 int16_t +#define int32 int32_t + +static inline int16 mullo(int16 x, int16 y) { + return x * y; +} + +static inline int16 mulhi(int16 x, int16 y) { + return (x * (int32)y) >> 16; +} + +static inline __m256i add(__m256i x, __m256i y) { + return _mm256_add_epi16(x, y); +} + +static inline __m256i sub(__m256i x, __m256i y) { + return _mm256_sub_epi16(x, y); +} + +static inline __m256i shiftleftconst(__m256i x, int16 y) { + return _mm256_slli_epi16(x, y); +} + +static inline __m256i signedshiftrightconst(__m256i x, int16 y) { + return _mm256_srai_epi16(x, y); +} + +static inline __m256i addconst(__m256i x, int16 y) { + return add(x, _mm256_set1_epi16(y)); +} + +static inline __m256i subconst(__m256i x, int16 y) { + return sub(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulloconst(__m256i x, int16 y) { + return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulhiconst(__m256i x, int16 y) { + return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i ifgesubconst(__m256i x, int16 y) { + __m256i y16 = _mm256_set1_epi16(y); +
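+    /* Lanewise conditional subtraction: _mm256_cmpgt_epi16(x, y-1) is all-ones in
+       lanes where x >= y, so ANDing with y16 selects y exactly in those lanes.
+       Scalar equivalent per 16-bit lane: if (x >= y) x -= y. */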
__m256i top16 = _mm256_set1_epi16((int16)(y - 1)); + return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); +} + +static inline __m256i ifnegaddconst(__m256i x, int16 y) { + return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); +} + +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x1541(void *v, const unsigned char *s) { + int16 *R0 = v; + int16 R1[327], R2[164], R3[82], R4[41], R5[21], R6[11], R7[6], R8[3], R9[2], R10[1]; + long long i; + int16 a0, a1, a2; + __m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; + + s += PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x1541_STRBYTES; + a1 = 0; + a1 += *--s; /* 0...255 */ + a1 = mulhi(a1, -48) - mulhi(mullo(a1, -6433), 2608); + a1 += *--s; /* -1304...1558 */ + a1 += (a1 >> 15) & 2608; /* 0...2607 */ + R10[0] = a1; + + /* R10 ------> R9: reconstruct mod 1*[71]+[9402] */ + + i = 0; + s -= 1; + a2 = a0 = R10[0]; + a0 = mulhi(a0, -13) - mulhi(mullo(a0, 25845), 71); /* -39...35 */ + a0 += s[1 * i + 0]; /* -39...290 */ + a0 = mulhi(a0, 3) - mulhi(mullo(a0, -923), 71); /* -36...35 */ + a0 += (a0 >> 15) & 71; /* 0...70 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -22153); + + /* invalid inputs might need reduction mod 9402 */ + a1 -= 9402; + a1 += (a1 >> 15) & 9402; + + R9[0] = a0; + R9[1] = a1; + s -= 0; + + /* R9 ------> R8: reconstruct mod 2*[134]+[9402] */ + + R8[2] = R9[1]; + s -= 1; + for (i = 0; i >= 0; --i) { + a2 = a0 = R9[i]; + a0 = mulhi(a0, 14) - mulhi(mullo(a0, 5869), 134); /* -67...70 */ + a0 += s[1 * i + 0]; /* -67...325 */ + a0 = mulhi(a0, 10) - mulhi(mullo(a0, -489), 134); /* -68...67 */ + a0 += (a0 >> 15) & 134; /* 0...133 */ + a1 = (a2 << 7) + ((s[i] - a0) >> 1); + a1 = mullo(a1, 19563); + + /* invalid inputs might need reduction mod 134 */ + a1 -= 134; + a1 += (a1 >> 15) & 134; + + R8[2 * i] = a0; + R8[2 * i + 1] = a1; + } + + /* R8 ------> R7: reconstruct mod 5*[2953]+[815] */ + + i = 0; + s -= 1; + a2 = a0 = R8[2]; + a0 = mulhi(a0, 1223) - mulhi(mullo(a0, -5681), 2953); /* -1477...1782 */ + a0 += s[1 * i + 0]; /* -1477...2037 */ + a0 += (a0 >> 15) & 2953; /* 0...2952 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -9543); + + /* invalid inputs might need reduction mod 815 */ + a1 -= 815; + a1 += (a1 >> 15) & 815; + + R7[4] = a0; + R7[5] = a1; + s -= 4; + for (i = 1; i >= 0; --i) { + a0 = R8[i]; + a0 = mulhi(a0, 1223) - mulhi(mullo(a0, -5681), 2953); /* -1477...1782 */ + a0 += s[2 * i + 1]; /* -1477...2037 */ + a0 = mulhi(a0, 1223) - mulhi(mullo(a0, -5681), 2953); /* -1505...1514 */ + a0 += s[2 * i + 0]; /* -1505...1769 */ + a0 += (a0 >> 15) & 2953; /* 0...2952 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, -9543); + + /* invalid inputs might need reduction mod 2953 */ + a1 -= 2953; + a1 += (a1 >> 15) & 2953; + + R7[2 * i] = a0; + R7[2 * i + 1] = a1; + } + + /* R7 ------> R6: reconstruct mod 10*[13910]+[815] */ + + R6[10] = R7[5]; + s -= 10; + for (i = 4; i >= 0; --i) { + a2 = a0 = R7[i]; + a0 = mulhi(a0, 1756) - mulhi(mullo(a0, -1206), 13910); /* -6955...7394 */ + a0 += s[2 * i + 1]; /* -6955...7649 */ + a0 = mulhi(a0, 1756) - mulhi(mullo(a0, -1206), 13910); /* -7142...7159 */ + a0 += s[2 * i + 0]; /* -7142...7414 */ + a0 += (a0 >> 15) & 13910; /* 0...13909 */ + a1 = (a2 << 15) + (s[2 * i + 1] << 7) + ((s[2 * i] - a0) >> 1); + a1 = mullo(a1, -13437); + + /* invalid inputs might need reduction mod 13910 */ + a1 -= 13910; + a1 += (a1 >> 15) & 13910; + + R6[2 * i] = a0; + R6[2 * i + 1] = a1; + } + + /* R6 ------> R5: reconstruct mod 20*[1887]+[815] */ + + R5[20] = R6[10]; + s -= 10; + for (i = 9; i >= 0; --i) { + a2 
= a0 = R6[i]; + a0 = mulhi(a0, -101) - mulhi(mullo(a0, -8891), 1887); /* -969...943 */ + a0 += s[1 * i + 0]; /* -969...1198 */ + a0 += (a0 >> 15) & 1887; /* 0...1886 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, 5279); + + /* invalid inputs might need reduction mod 1887 */ + a1 -= 1887; + a1 += (a1 >> 15) & 1887; + + R5[2 * i] = a0; + R5[2 * i + 1] = a1; + } + + /* R5 ------> R4: reconstruct mod 40*[695]+[815] */ + + R4[40] = R5[20]; + s -= 20; + i = 4; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R5[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -84), mulhiconst(mulloconst(A0, -24140), 695)); /* -369...347 */ + A0 = add(A0, S0); /* -369...602 */ + A0 = ifnegaddconst(A0, 695); /* 0...694 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, 31495); + + /* invalid inputs might need reduction mod 695 */ + A1 = ifgesubconst(A1, 695); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R4 ------> R3: reconstruct mod 81*[6745]+[7910] */ + + i = 0; + s -= 2; + a0 = R4[40]; + a0 = mulhi(a0, 2401) - mulhi(mullo(a0, -2487), 6745); /* -3373...3972 */ + a0 += s[2 * i + 1]; /* -3373...4227 */ + a0 = mulhi(a0, 2401) - mulhi(mullo(a0, -2487), 6745); /* -3497...3527 */ + a0 += s[2 * i + 0]; /* -3497...3782 */ + a0 += (a0 >> 15) & 6745; /* 0...6744 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, -29207); + + /* invalid inputs might need reduction mod 7910 */ + a1 -= 7910; + a1 += (a1 >> 15) & 7910; + + R3[80] = a0; + R3[81] = a1; + s -= 80; + i = 24; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R4[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 2401), mulhiconst(mulloconst(A0, -2487), 6745)); /* -3373...3972 */ + A0 = add(A0, S1); /* -3373...4227 */ + A0 = sub(mulhiconst(A0, 2401), mulhiconst(mulloconst(A0, -2487), 6745)); /* -3497...3527 */ + A0 = add(A0, S0); /* -3497...3782 */ + A0 = ifnegaddconst(A0, 6745); /* 0...6744 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, -29207); + + /* invalid inputs might need reduction mod 6745 */ + A1 = ifgesubconst(A1, 6745); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R3 ------> 
R2: reconstruct mod 163*[1314]+[1541] */ + + i = 0; + s -= 1; + a2 = a0 = R3[81]; + a0 = mulhi(a0, 64) - mulhi(mullo(a0, -12768), 1314); /* -657...673 */ + a0 += s[1 * i + 0]; /* -657...928 */ + a0 += (a0 >> 15) & 1314; /* 0...1313 */ + a1 = (a2 << 7) + ((s[i] - a0) >> 1); + a1 = mullo(a1, -399); + + /* invalid inputs might need reduction mod 1541 */ + a1 -= 1541; + a1 += (a1 >> 15) & 1541; + + R2[162] = a0; + R2[163] = a1; + s -= 81; + i = 65; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R3[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 64), mulhiconst(mulloconst(A0, -12768), 1314)); /* -657...673 */ + A0 = add(A0, S0); /* -657...928 */ + A0 = ifnegaddconst(A0, 1314); /* 0...1313 */ + A1 = add(shiftleftconst(A2, 7), signedshiftrightconst(sub(S0, A0), 1)); + A1 = mulloconst(A1, -399); + + /* invalid inputs might need reduction mod 1314 */ + A1 = ifgesubconst(A1, 1314); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R2 ------> R1: reconstruct mod 326*[9277]+[1541] */ + + R1[326] = R2[163]; + s -= 326; + i = 147; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R2[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 4400), mulhiconst(mulloconst(A0, -1808), 9277)); /* -4639...5738 */ + A0 = add(A0, S1); /* -4639...5993 */ + A0 = sub(mulhiconst(A0, 4400), mulhiconst(mulloconst(A0, -1808), 9277)); /* -4950...5040 */ + A0 = add(A0, S0); /* -4950...5295 */ + A0 = ifnegaddconst(A0, 9277); /* 0...9276 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, -27883); + + /* invalid inputs might need reduction mod 9277 */ + A1 = ifgesubconst(A1, 9277); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R1 ------> R0: reconstruct mod 653*[1541] */ + + R0[652] = 3 * R1[326] - 2310; + s -= 326; + i = 310; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R1[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 349), mulhiconst(mulloconst(A0, -10887), 1541)); /* -771...857 */ + A0 = add(A0, S0); /* -771...1112 */ + A0 = ifnegaddconst(A0, 1541); /* 0...1540 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = 
mulloconst(A1, -10547); + + /* invalid inputs might need reduction mod 1541 */ + A1 = ifgesubconst(A1, 1541); + + A0 = mulloconst(A0, 3); + A1 = mulloconst(A1, 3); + A0 = subconst(A0, 2310); + A1 = subconst(A1, 2310); + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } +} diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_653x1541.h b/crypto_kem/sntrup653/avx2/crypto_decode_653x1541.h new file mode 100644 index 00000000..5827b644 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_653x1541.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_653X1541_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_653X1541_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x1541_STRBYTES 865 +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x1541_ITEMS 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x1541_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x1541(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_653x3.c b/crypto_kem/sntrup653/avx2/crypto_decode_653x3.c new file mode 100644 index 00000000..3dc5c580 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_653x3.c @@ -0,0 +1,65 @@ +#include "crypto_decode_653x3.h" +#include +#define uint8 uint8_t + +#define p 653 +#define loops 6 +#define overshoot 29 + +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x3(void *v, const unsigned char *s) { + uint8 *f = v; + int loop; + uint8 *nextf = f + 128 - 4 * overshoot; + const unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i s0 = _mm256_loadu_si256((const __m256i *) s); + s = nexts; + nexts += 32; + + __m256i s1 = _mm256_srli_epi16(s0 & _mm256_set1_epi8(-16), 4); + s0 &= _mm256_set1_epi8(15); + + __m256i a0 = _mm256_unpacklo_epi8(s0, s1); + /* 0 0>>4 1 1>>4 2 2>>4 3 3>>4 4 4>>4 5 5>>4 6 6>>4 7 7>>4 */ + /* 16 16>>4 ... */ + __m256i a1 = _mm256_unpackhi_epi8(s0, s1); + /* 8 8>>4 9 9>>4 10 10>>4 ... */ + /* 24 24>>4 ... */ + + __m256i a2 = _mm256_srli_epi16(a0 & _mm256_set1_epi8(12), 2); + __m256i a3 = _mm256_srli_epi16(a1 & _mm256_set1_epi8(12), 2); + a0 &= _mm256_set1_epi8(3); + a1 &= _mm256_set1_epi8(3); + + __m256i b0 = _mm256_unpacklo_epi8(a0, a2); + /* 0 0>>2 0>>4 0>>6 1 1>>2 1>>4 1>>6 */ + /* 2 2>>2 2>>4 2>>6 3 3>>2 3>>4 3>.6 */ + /* 16 16>>2 16>>4 16>>6 ... */ + __m256i b2 = _mm256_unpackhi_epi8(a0, a2); + /* 4 4>>2 ... */ + __m256i b1 = _mm256_unpacklo_epi8(a1, a3); + /* 8 8>>2 ... */ + __m256i b3 = _mm256_unpackhi_epi8(a1, a3); + /* 12 12>>2 ... 
*/ + + __m256i f0 = _mm256_permute2x128_si256(b0, b2, 0x20); + __m256i f2 = _mm256_permute2x128_si256(b0, b2, 0x31); + __m256i f1 = _mm256_permute2x128_si256(b1, b3, 0x20); + __m256i f3 = _mm256_permute2x128_si256(b1, b3, 0x31); + + f0 = _mm256_add_epi8(f0, _mm256_set1_epi8(-1)); + f1 = _mm256_add_epi8(f1, _mm256_set1_epi8(-1)); + f2 = _mm256_add_epi8(f2, _mm256_set1_epi8(-1)); + f3 = _mm256_add_epi8(f3, _mm256_set1_epi8(-1)); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + f = nextf; + nextf += 128; + } + + *f = ((uint8)(*s & 3)) - 1; +} diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_653x3.h b/crypto_kem/sntrup653/avx2/crypto_decode_653x3.h new file mode 100644 index 00000000..c331d9cc --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_653x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_653X3_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_653X3_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x3_STRBYTES 164 +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x3_ITEMS 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x3(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_653x4621.c b/crypto_kem/sntrup653/avx2/crypto_decode_653x4621.c new file mode 100644 index 00000000..04368a95 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_653x4621.c @@ -0,0 +1,408 @@ +#include "crypto_decode_653x4621.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define int32 int32_t + +static inline int16 mullo(int16 x, int16 y) { + return x * y; +} + +static inline int16 mulhi(int16 x, int16 y) { + return (x * (int32)y) >> 16; +} + +static inline __m256i add(__m256i x, __m256i y) { + return _mm256_add_epi16(x, y); +} + +static inline __m256i sub(__m256i x, __m256i y) { + return _mm256_sub_epi16(x, y); +} + +static inline __m256i shiftleftconst(__m256i x, int16 y) { + return _mm256_slli_epi16(x, y); +} + +static inline __m256i signedshiftrightconst(__m256i x, int16 y) { + return _mm256_srai_epi16(x, y); +} + +static inline __m256i addconst(__m256i x, int16 y) { + return add(x, _mm256_set1_epi16(y)); +} + +static inline __m256i subconst(__m256i x, int16 y) { + return sub(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulloconst(__m256i x, int16 y) { + return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulhiconst(__m256i x, int16 y) { + return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i ifgesubconst(__m256i x, int16 y) { + __m256i y16 = _mm256_set1_epi16(y); + __m256i top16 = _mm256_set1_epi16((int16)(y - 1)); + return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); +} + +static inline __m256i ifnegaddconst(__m256i x, int16 y) { + return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); +} + +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x4621(void *v, const unsigned char *s) { + int16 *R0 = v; + int16 R1[327], R2[164], R3[82], R4[41], R5[21], R6[11], R7[6], R8[3], R9[2], R10[1]; + long long i; + int16 a0, a1, a2; + __m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; + + s += PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x4621_STRBYTES; + a1 = 0; + a1 += *--s; /* 0...255 */ + a1 -= 86; /* -86...169 */ + a1 -= 86; /* -172...83 */ + a1 += (a1 >> 15) & 86; /* -86...85 */ + a1 += (a1 >> 15) & 86; /* 0...85 */ + R10[0] = a1; + + 
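+ /* Sketch of the invariant the phases below reverse (compare the matching
+    encoder in crypto_encode_653x4621.c): each "R(k+1) ------> R(k)" step
+    rebuilds a packed value v from the limb produced at the previous level
+    together with any bytes the encoder emitted at that level, then splits it as
+        a0 = v mod M    (the mulhi/mullo pair approximates the remainder and
+                         the masked adds force it into 0...M-1), and
+        a1 = (v - a0)/M (recovered exactly mod 2^16 by multiplying by a
+                         precomputed inverse of M, with the shifts absorbing
+                         any factors of two in M).
+    The "invalid inputs might need reduction" corrections keep a1 in range
+    for byte strings that are not canonical encodings. */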
/* R10 ------> R9: reconstruct mod 1*[835]+[6708] */ + + i = 0; + s -= 2; + a0 = R10[0]; + a0 = mulhi(a0, 396) - mulhi(mullo(a0, -20092), 835); /* -418...516 */ + a0 += s[2 * i + 1]; /* -418...771 */ + a0 = mulhi(a0, 396) - mulhi(mullo(a0, -20092), 835); /* -421...422 */ + a0 += s[2 * i + 0]; /* -421...677 */ + a0 += (a0 >> 15) & 835; /* 0...834 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, 8555); + + /* invalid inputs might need reduction mod 6708 */ + a1 -= 6708; + a1 += (a1 >> 15) & 6708; + + R9[0] = a0; + R9[1] = a1; + s -= 0; + + /* R9 ------> R8: reconstruct mod 2*[7396]+[6708] */ + + R8[2] = R9[1]; + s -= 2; + for (i = 0; i >= 0; --i) { + a2 = a0 = R9[i]; + a0 = mulhi(a0, 3088) - mulhi(mullo(a0, -2268), 7396); /* -3698...4470 */ + a0 += s[2 * i + 1]; /* -3698...4725 */ + a0 = mulhi(a0, 3088) - mulhi(mullo(a0, -2268), 7396); /* -3873...3920 */ + a0 += s[2 * i + 0]; /* -3873...4175 */ + a0 += (a0 >> 15) & 7396; /* 0...7395 */ + a1 = (a2 << 14) + (s[2 * i + 1] << 6) + ((s[2 * i] - a0) >> 2); + a1 = mullo(a1, -18679); + + /* invalid inputs might need reduction mod 7396 */ + a1 -= 7396; + a1 += (a1 >> 15) & 7396; + + R8[2 * i] = a0; + R8[2 * i + 1] = a1; + } + + /* R8 ------> R7: reconstruct mod 5*[86]+[78] */ + + s -= 0; + a2 = a0 = R8[2]; + a0 = mulhi(a0, 4) - mulhi(mullo(a0, -762), 86); /* -43...44 */ + a0 += (a0 >> 15) & 86; /* 0...85 */ + a1 = (a2 - a0) >> 1; + a1 = mullo(a1, -16765); + + /* invalid inputs might need reduction mod 78 */ + a1 -= 78; + a1 += (a1 >> 15) & 78; + + R7[4] = a0; + R7[5] = a1; + s -= 0; + for (i = 1; i >= 0; --i) { + a2 = a0 = R8[i]; + a0 = mulhi(a0, 4) - mulhi(mullo(a0, -762), 86); /* -43...44 */ + a0 += (a0 >> 15) & 86; /* 0...85 */ + a1 = (a2 - a0) >> 1; + a1 = mullo(a1, -16765); + + /* invalid inputs might need reduction mod 86 */ + a1 -= 86; + a1 += (a1 >> 15) & 86; + + R7[2 * i] = a0; + R7[2 * i + 1] = a1; + } + + /* R7 ------> R6: reconstruct mod 10*[2370]+[78] */ + + R6[10] = R7[5]; + s -= 10; + for (i = 4; i >= 0; --i) { + a2 = a0 = R7[i]; + a0 = mulhi(a0, -14) - mulhi(mullo(a0, -7079), 2370); /* -1189...1185 */ + a0 += s[2 * i + 1]; /* -1189...1440 */ + a0 = mulhi(a0, -14) - mulhi(mullo(a0, -7079), 2370); /* -1186...1185 */ + a0 += s[2 * i + 0]; /* -1186...1440 */ + a0 += (a0 >> 15) & 2370; /* 0...2369 */ + a1 = (a2 << 15) + (s[2 * i + 1] << 7) + ((s[2 * i] - a0) >> 1); + a1 = mullo(a1, -8351); + + /* invalid inputs might need reduction mod 2370 */ + a1 -= 2370; + a1 += (a1 >> 15) & 2370; + + R6[2 * i] = a0; + R6[2 * i + 1] = a1; + } + + /* R6 ------> R5: reconstruct mod 20*[12461]+[78] */ + + R5[20] = R6[10]; + s -= 20; + for (i = 9; i >= 0; --i) { + a0 = R6[i]; + a0 = mulhi(a0, 4710) - mulhi(mullo(a0, -1346), 12461); /* -6231...7408 */ + a0 += s[2 * i + 1]; /* -6231...7663 */ + a0 = mulhi(a0, 4710) - mulhi(mullo(a0, -1346), 12461); /* -6679...6781 */ + a0 += s[2 * i + 0]; /* -6679...7036 */ + a0 += (a0 >> 15) & 12461; /* 0...12460 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, -19675); + + /* invalid inputs might need reduction mod 12461 */ + a1 -= 12461; + a1 += (a1 >> 15) & 12461; + + R5[2 * i] = a0; + R5[2 * i + 1] = a1; + } + + /* R5 ------> R4: reconstruct mod 40*[1786]+[78] */ + + R4[40] = R5[20]; + s -= 20; + i = 4; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R5[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -468), mulhiconst(mulloconst(A0, -9394), 1786)); /* -1010...893 */ + A0 = add(A0, S0); /* -1010...1148 */ + A0 = ifnegaddconst(A0, 
1786); /* 0...1785 */ + A1 = add(shiftleftconst(A2, 7), signedshiftrightconst(sub(S0, A0), 1)); + A1 = mulloconst(A1, -12843); + + /* invalid inputs might need reduction mod 1786 */ + A1 = ifgesubconst(A1, 1786); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R4 ------> R3: reconstruct mod 81*[676]+[7510] */ + + i = 0; + s -= 2; + a2 = a0 = R4[40]; + a0 = mulhi(a0, 248) - mulhi(mullo(a0, -24818), 676); /* -338...400 */ + a0 += s[2 * i + 1]; /* -338...655 */ + a0 = mulhi(a0, 248) - mulhi(mullo(a0, -24818), 676); /* -340...340 */ + a0 += s[2 * i + 0]; /* -340...595 */ + a0 += (a0 >> 15) & 676; /* 0...675 */ + a1 = (a2 << 14) + (s[2 * i + 1] << 6) + ((s[2 * i] - a0) >> 2); + a1 = mullo(a1, -23655); + + /* invalid inputs might need reduction mod 7510 */ + a1 -= 7510; + a1 += (a1 >> 15) & 7510; + + R3[80] = a0; + R3[81] = a1; + s -= 40; + i = 24; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R4[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 248), mulhiconst(mulloconst(A0, -24818), 676)); /* -338...400 */ + A0 = add(A0, S0); /* -338...655 */ + A0 = ifnegaddconst(A0, 676); /* 0...675 */ + A1 = add(shiftleftconst(A2, 6), signedshiftrightconst(sub(S0, A0), 2)); + A1 = mulloconst(A1, -23655); + + /* invalid inputs might need reduction mod 676 */ + A1 = ifgesubconst(A1, 676); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R3 ------> R2: reconstruct mod 163*[416]+[4621] */ + + i = 0; + s -= 1; + a2 = a0 = R3[81]; + a0 = mulhi(a0, -64) - mulhi(mullo(a0, 25206), 416); /* -224...208 */ + a0 += s[1 * i + 0]; /* -224...463 */ + a0 -= 416; /* -640..>47 */ + a0 += (a0 >> 15) & 416; /* -224...415 */ + a0 += (a0 >> 15) & 416; /* 0...415 */ + a1 = (a2 << 3) + ((s[i] - a0) >> 5); + a1 = mullo(a1, 20165); + + /* invalid inputs might need reduction mod 4621 */ + a1 -= 4621; + a1 += (a1 >> 15) & 4621; + + R2[162] = a0; + R2[163] = a1; + s -= 81; + i = 65; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R3[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -64), mulhiconst(mulloconst(A0, 25206), 416)); /* -224...208 */ + A0 = add(A0, S0); /* -224...463 */ + A0 = subconst(A0, 416); /* -640...47 */ + A0 = ifnegaddconst(A0, 416); /* -224...415 
*/ + A0 = ifnegaddconst(A0, 416); /* 0...415 */ + A1 = add(shiftleftconst(A2, 3), signedshiftrightconst(sub(S0, A0), 5)); + A1 = mulloconst(A1, 20165); + + /* invalid inputs might need reduction mod 416 */ + A1 = ifgesubconst(A1, 416); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R2 ------> R1: reconstruct mod 326*[326]+[4621] */ + + R1[326] = R2[163]; + s -= 163; + i = 147; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R2[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -48), mulhiconst(mulloconst(A0, 14072), 326)); /* -175...163 */ + A0 = add(A0, S0); /* -175...418 */ + A0 = subconst(A0, 326); /* -501...92 */ + A0 = ifnegaddconst(A0, 326); /* -175...325 */ + A0 = ifnegaddconst(A0, 326); /* 0...325 */ + A1 = add(shiftleftconst(A2, 7), signedshiftrightconst(sub(S0, A0), 1)); + A1 = mulloconst(A1, -19701); + + /* invalid inputs might need reduction mod 326 */ + A1 = ifgesubconst(A1, 326); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R1 ------> R0: reconstruct mod 653*[4621] */ + + R0[652] = R1[326] - 2310; + s -= 652; + i = 310; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R1[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, -1635), mulhiconst(mulloconst(A0, -3631), 4621)); /* -2720...2310 */ + A0 = add(A0, S1); /* -2720...2565 */ + A0 = sub(mulhiconst(A0, -1635), mulhiconst(mulloconst(A0, -3631), 4621)); /* -2375...2378 */ + A0 = add(A0, S0); /* -2375...2633 */ + A0 = ifnegaddconst(A0, 4621); /* 0...4620 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, -29499); + + /* invalid inputs might need reduction mod 4621 */ + A1 = ifgesubconst(A1, 4621); + + A0 = subconst(A0, 2310); + A1 = subconst(A1, 2310); + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 
*/ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } +} diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_653x4621.h b/crypto_kem/sntrup653/avx2/crypto_decode_653x4621.h new file mode 100644 index 00000000..cb2a0a18 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_653x4621.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_653X4621_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_653X4621_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x4621_STRBYTES 994 +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x4621_ITEMS 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x4621_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x4621(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_653xint16.c b/crypto_kem/sntrup653/avx2/crypto_decode_653xint16.c new file mode 100644 index 00000000..dfa1cda2 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_653xint16.c @@ -0,0 +1,16 @@ +#include "crypto_decode_653xint16.h" + + +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_653xint16.h b/crypto_kem/sntrup653/avx2/crypto_decode_653xint16.h new file mode 100644 index 00000000..42e759a9 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_653xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_653XINT16_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_653XINT16_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint16_STRBYTES 1306 +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint16_ITEMS 653 + +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_653xint32.c b/crypto_kem/sntrup653/avx2/crypto_decode_653xint32.c new file mode 100644 index 00000000..cbd3252f --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_653xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_653xint32.h" + + +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_653xint32.h b/crypto_kem/sntrup653/avx2/crypto_decode_653xint32.h new file mode 100644 index 00000000..5da882d6 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_653xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_653XINT32_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_653XINT32_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint32_STRBYTES 2612 +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint32_ITEMBYTES 4 +#define PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint32_ITEMS 653 + +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint32(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_int16.c b/crypto_kem/sntrup653/avx2/crypto_decode_int16.c new 
file mode 100644 index 00000000..7db93101 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_int16.c @@ -0,0 +1,9 @@ +#include "crypto_decode_int16.h" + + +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_int16(void *x, const unsigned char *s) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *(uint16_t *) x = u0 | u1; +} diff --git a/crypto_kem/sntrup653/avx2/crypto_decode_int16.h b/crypto_kem/sntrup653/avx2/crypto_decode_int16.h new file mode 100644 index 00000000..58e7279e --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_decode_int16.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_INT16_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_DECODE_INT16_H + +#include +#define crypto_core_multsntrup857_STRBYTES 2 +#define crypto_core_multsntrup857_ITEMBYTES 2 +#define crypto_core_multsntrup857_ITEMS 1 +void PQCLEAN_SNTRUP653_AVX2_crypto_decode_int16(void *x, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653x1541.c b/crypto_kem/sntrup653/avx2/crypto_encode_653x1541.c new file mode 100644 index 00000000..4269c16b --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653x1541.c @@ -0,0 +1,286 @@ +#include "crypto_encode_653x1541.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x1541(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[327]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 41; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 2; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2310)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1541)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[326] = (((R0[652] + 2310) & 16383) * 10923) >> 15; + + reading = (uint16 *) R; + writing = R; + i = 11; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 26; + writing -= 13; + out -= 26; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9277)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9277)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 
2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[163] = R[326]; + + reading = (uint16 *) R; + writing = R; + i = 11; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 12; + writing -= 6; + out -= 6; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1314)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 14; + writing -= 7; + out -= 14; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6745)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6745)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(695)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[20] = R[40]; + + for (i = 0; i < 10; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 
1]; + r2 = r0 + r1 * (uint32)1887; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[10] = R[20]; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)13910; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[5] = R[10]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[4]; + r1 = R[5]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + R[2] = r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)134; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)71; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653x1541.h b/crypto_kem/sntrup653/avx2/crypto_encode_653x1541.h new file mode 100644 index 00000000..6319c0a0 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653x1541.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653X1541_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653X1541_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x1541_STRBYTES 865 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x1541_ITEMS 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x1541_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x1541(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653x1541round.c b/crypto_kem/sntrup653/avx2/crypto_encode_653x1541round.c new file mode 100644 index 00000000..d019eeb5 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653x1541round.c @@ -0,0 +1,288 @@ +#include "crypto_encode_653x1541round.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x1541round(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[327]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 41; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 2; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); + x = _mm256_add_epi16(x, _mm256_add_epi16(x, x)); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2310)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1541)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[326] = (((3 * ((10923 * R0[652] + 16384) >> 15) + 2310) & 16383) * 10923) >> 15; + + reading = (uint16 *) R; + writing 
= R; + i = 11; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 26; + writing -= 13; + out -= 26; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9277)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9277)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[163] = R[326]; + + reading = (uint16 *) R; + writing = R; + i = 11; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 12; + writing -= 6; + out -= 6; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1314)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 14; + writing -= 7; + out -= 14; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6745)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6745)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & 
_mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(695)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[20] = R[40]; + + for (i = 0; i < 10; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1887; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[10] = R[20]; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)13910; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[5] = R[10]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[4]; + r1 = R[5]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + R[2] = r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)134; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)71; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653x1541round.h b/crypto_kem/sntrup653/avx2/crypto_encode_653x1541round.h new file mode 100644 index 00000000..03640612 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653x1541round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653X1541ROUND_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653X1541ROUND_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x1541round_STRBYTES 865 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x1541round_ITEMS 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x1541round_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x1541round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653x3.c b/crypto_kem/sntrup653/avx2/crypto_encode_653x3.c new file mode 100644 index 00000000..c4b4b5e0 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653x3.c @@ -0,0 +1,64 @@ +#include "crypto_encode_653x3.h" +#include +#define uint8 uint8_t + +#define p 653 +#define loops 6 +#define overshoot 29 + +static const union { + uint8 init[32]; + __m256i val; +} lobytes_buf = { .init = { + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + } +}; +#define lobytes (lobytes_buf.val) + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x3(unsigned char *s, const void *v) { + const uint8 *f = v; + int loop; + const uint8 *nextf = f + 128 - 4 * overshoot; + unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i f0 = _mm256_loadu_si256((const __m256i *) (f + 0)); + __m256i f1 = _mm256_loadu_si256((const __m256i *) (f + 32)); + __m256i f2 = _mm256_loadu_si256((const __m256i *) (f + 64)); + __m256i f3 = _mm256_loadu_si256((const __m256i *) (f + 96)); + f = nextf; + nextf += 
128; + + __m256i a0 = _mm256_packus_epi16(f0 & lobytes, f1 & lobytes); + /* 0 2 4 6 8 10 12 14 32 34 36 38 40 42 44 46 */ + /* 16 18 20 22 24 26 28 30 48 50 52 54 56 58 60 62 */ + __m256i a1 = _mm256_packus_epi16(_mm256_srli_epi16(f0, 8), _mm256_srli_epi16(f1, 8)); + /* 1 3 ... */ + __m256i a2 = _mm256_packus_epi16(f2 & lobytes, f3 & lobytes); + __m256i a3 = _mm256_packus_epi16(_mm256_srli_epi16(f2, 8), _mm256_srli_epi16(f3, 8)); + + a0 = _mm256_add_epi8(a0, _mm256_slli_epi16(a1 & _mm256_set1_epi8(63), 2)); + a2 = _mm256_add_epi8(a2, _mm256_slli_epi16(a3 & _mm256_set1_epi8(63), 2)); + + __m256i b0 = _mm256_packus_epi16(a0 & lobytes, a2 & lobytes); + /* 0 4 8 12 32 36 40 44 64 68 72 76 96 100 104 108 */ + /* 16 20 24 28 48 52 56 60 80 84 88 92 112 116 120 124 */ + __m256i b2 = _mm256_packus_epi16(_mm256_srli_epi16(a0, 8), _mm256_srli_epi16(a2, 8)); + /* 2 6 ... */ + + b0 = _mm256_add_epi8(b0, _mm256_slli_epi16(b2 & _mm256_set1_epi8(15), 4)); + + b0 = _mm256_permutevar8x32_epi32(b0, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + + b0 = _mm256_add_epi8(b0, _mm256_set1_epi8(85)); + + _mm256_storeu_si256((__m256i *) s, b0); + s = nexts; + nexts += 32; + } + + *s++ = *f++ + 1; +} diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653x3.h b/crypto_kem/sntrup653/avx2/crypto_encode_653x3.h new file mode 100644 index 00000000..029ce76f --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653X3_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653X3_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x3_STRBYTES 164 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x3_ITEMS 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653x4621.c b/crypto_kem/sntrup653/avx2/crypto_encode_653x4621.c new file mode 100644 index 00000000..45c443b7 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653x4621.c @@ -0,0 +1,288 @@ +#include "crypto_encode_653x4621.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x4621(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[327]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 21; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 20; + writing -= 10; + out -= 20; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2310)); + x2 = _mm256_add_epi16(x2, _mm256_set1_epi16(2310)); + x &= _mm256_set1_epi16(16383); + x2 &= _mm256_set1_epi16(16383); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(4621)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(4621)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 
14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[326] = ((R0[652] + 2310) & 16383); + + reading = (uint16 *) R; + writing = R; + i = 21; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 10; + writing -= 5; + out -= 5; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(326)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[163] = R[326]; + + reading = (uint16 *) R; + writing = R; + i = 11; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 12; + writing -= 6; + out -= 6; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(416)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 5; + while (i > 0) { + __m256i x, y; + --i; + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(676)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + r0 = R[80]; + r1 = R[81]; + r2 = r0 + r1 * (uint32)676; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[40] = r2; + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, 
_mm256_set1_epi32(1786)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[20] = R[40]; + + for (i = 0; i < 10; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)12461; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[10] = R[20]; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2370; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[5] = R[10]; + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)86; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)7396; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)835; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653x4621.h b/crypto_kem/sntrup653/avx2/crypto_encode_653x4621.h new file mode 100644 index 00000000..f56bb49f --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653x4621.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653X4621_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653X4621_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x4621_STRBYTES 994 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x4621_ITEMS 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x4621_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x4621(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653xfreeze3.c b/crypto_kem/sntrup653/avx2/crypto_encode_653xfreeze3.c new file mode 100644 index 00000000..dedece25 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653xfreeze3.c @@ -0,0 +1,31 @@ +#include "crypto_encode_653xfreeze3.h" +#include +#define int16 int16_t + +#define p 653 + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xfreeze3(unsigned char *s, const void *v) { + const int16 *r = v; + + int i = p - 16; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) r); + __m256i y = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); + x = _mm256_sub_epi16(x, y); + y = _mm256_add_epi16(y, y); + x = _mm256_sub_epi16(x, y); + __m128i x0 = _mm256_extractf128_si256(x, 0); + __m128i x1 = _mm256_extractf128_si256(x, 1); + _mm_storeu_si128((__m128i *) s, _mm_packs_epi16(x0, x1)); + i -= 16; + r += 16; + s += 16; + } while (i >= 0); + if (i <= -16) { + break; + } + r += i; + s += i; + } +} diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653xfreeze3.h b/crypto_kem/sntrup653/avx2/crypto_encode_653xfreeze3.h new file mode 100644 index 00000000..e4e8102c --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653xfreeze3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653XFREEZE3_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653XFREEZE3_H + +#include +#define 
PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xfreeze3_STRBYTES 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xfreeze3_ITEMS 653 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xfreeze3_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xfreeze3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653xint16.c b/crypto_kem/sntrup653/avx2/crypto_encode_653xint16.c new file mode 100644 index 00000000..5443662e --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_653xint16.h" + + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_653xint16.h b/crypto_kem/sntrup653/avx2/crypto_encode_653xint16.h new file mode 100644 index 00000000..17130d4f --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_653xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653XINT16_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_653XINT16_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xint16_STRBYTES 1306 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xint16_ITEMS 653 + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_int16.c b/crypto_kem/sntrup653/avx2/crypto_encode_int16.c new file mode 100644 index 00000000..aa327402 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_int16.c @@ -0,0 +1,9 @@ +#include "crypto_encode_int16.h" + +#define uint16 uint16_t + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_int16(unsigned char *s, const void *x) { + uint16 u = *(const uint16 *) x; + s[0] = u; + s[1] = u >> 8; +} diff --git a/crypto_kem/sntrup653/avx2/crypto_encode_int16.h b/crypto_kem/sntrup653/avx2/crypto_encode_int16.h new file mode 100644 index 00000000..553e8ee4 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_encode_int16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_INT16_H +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_ENCODE_INT16_H + +#include +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_int16_STRBYTES 2 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_int16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP653_AVX2_crypto_encode_int16_ITEMS 1 + +void PQCLEAN_SNTRUP653_AVX2_crypto_encode_int16(unsigned char *s, const void *x); +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_sort_int32.c b/crypto_kem/sntrup653/avx2/crypto_sort_int32.c new file mode 100644 index 00000000..1db58519 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_sort_int32.c @@ -0,0 +1,1210 @@ +#include "crypto_sort_int32.h" +#include +// Based on supercop-20200820/crypto_sort/int32/avx2 + + +#define int32 int32_t + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min((a),(b)); \ + (b) = int32x8_max((a),(b)); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = (int32)((int64_t) * b - (int64_t) * a); + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 
*x, int32 *y, size_t n) { + if ((long long) n < 8) { + while ((long long) n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) 
+static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; 
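+ /* xor with the all-ones mask complements every lane; since ~v == -1 - v,
+    complementing reverses the signed order, which is how this sorter turns
+    ascending runs into the descending runs needed by the bitonic merges
+    when flagdown is set. */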
+ x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + 
p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } 
+ } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, 
d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = 
int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_SNTRUP653_AVX2_crypto_sort_int32(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], 
&x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *) y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_SNTRUP653_AVX2_crypto_sort_int32(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 
c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + 
int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/sntrup653/avx2/crypto_sort_int32.h b/crypto_kem/sntrup653/avx2/crypto_sort_int32.h new file mode 100644 index 00000000..a3772a41 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_SORT +#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_SORT + +#include +#include + + +void PQCLEAN_SNTRUP653_AVX2_crypto_sort_int32(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/sntrup653/avx2/crypto_sort_uint32.c b/crypto_kem/sntrup653/avx2/crypto_sort_uint32.c new file mode 100644 index 00000000..289d07f6 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_SNTRUP653_AVX2_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_SNTRUP653_AVX2_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/sntrup653/avx2/crypto_sort_uint32.h b/crypto_kem/sntrup653/avx2/crypto_sort_uint32.h new file mode 100644 index 
00000000..cab8ea1f
--- /dev/null
+++ b/crypto_kem/sntrup653/avx2/crypto_sort_uint32.h
@@ -0,0 +1,10 @@
+#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_SORT_UINT32_H
+#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_SORT_UINT32_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+void PQCLEAN_SNTRUP653_AVX2_crypto_sort_uint32(uint32_t *array, size_t n);
+
+#endif
diff --git a/crypto_kem/sntrup653/avx2/crypto_stream_aes256ctr.c b/crypto_kem/sntrup653/avx2/crypto_stream_aes256ctr.c
new file mode 100644
index 00000000..ad8eccf9
--- /dev/null
+++ b/crypto_kem/sntrup653/avx2/crypto_stream_aes256ctr.c
@@ -0,0 +1,15 @@
+#include "crypto_stream_aes256ctr.h"
+
+
+int PQCLEAN_SNTRUP653_AVX2_crypto_stream_aes256ctr(
+    uint8_t *out,
+    size_t outlen,
+    const uint8_t nonce[AESCTR_NONCEBYTES],
+    const uint8_t key[AES256_KEYBYTES]) {
+
+    aes256ctx state;
+    aes256_ctr_keyexp(&state, key);
+    aes256_ctr(out, outlen, nonce, &state);
+    aes256_ctx_release(&state);
+    return 0;
+}
diff --git a/crypto_kem/sntrup653/avx2/crypto_stream_aes256ctr.h b/crypto_kem/sntrup653/avx2/crypto_stream_aes256ctr.h
new file mode 100644
index 00000000..f4a2d133
--- /dev/null
+++ b/crypto_kem/sntrup653/avx2/crypto_stream_aes256ctr.h
@@ -0,0 +1,15 @@
+#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_STREAM_AES256CTR_H
+#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_STREAM_AES256CTR_H
+#include "aes.h"
+#include <stddef.h>
+#include <stdint.h>
+
+
+
+int PQCLEAN_SNTRUP653_AVX2_crypto_stream_aes256ctr(
+    uint8_t *out,
+    size_t outlen,
+    const uint8_t nonce[AESCTR_NONCEBYTES],
+    const uint8_t key[AES256_KEYBYTES]);
+
+#endif
diff --git a/crypto_kem/sntrup653/avx2/crypto_verify_897.c b/crypto_kem/sntrup653/avx2/crypto_verify_897.c
new file mode 100644
index 00000000..4b298e67
--- /dev/null
+++ b/crypto_kem/sntrup653/avx2/crypto_verify_897.c
@@ -0,0 +1,36 @@
+#include "crypto_verify_897.h"
+#include <immintrin.h>
+
+int PQCLEAN_SNTRUP653_AVX2_crypto_verify_897(const unsigned char *x, const unsigned char *y) {
+    __m256i diff = _mm256_set1_epi8(0);
+    unsigned int differentbits = 0;
+    int i = PQCLEAN_SNTRUP653_AVX2_crypto_verify_897_BYTES;
+
+    i -= 32;
+    for (;;) {
+        do {
+            __m256i x0 = _mm256_loadu_si256((__m256i *) x);
+            __m256i y0 = _mm256_loadu_si256((__m256i *) y);
+            diff |= x0 ^ y0;
+            i -= 32;
+            x += 32;
+            y += 32;
+        } while (i >= 0);
+        if (i <= -32) {
+            break;
+        }
+        x += i;
+        y += i;
+    }
+
+    diff |= _mm256_srli_epi16(diff, 8);
+    diff |= _mm256_srli_epi32(diff, 16);
+    diff |= _mm256_srli_epi64(diff, 32);
+
+    differentbits = _mm256_extract_epi8(diff, 0);
+    differentbits |= _mm256_extract_epi8(diff, 8);
+    differentbits |= _mm256_extract_epi8(diff, 16);
+    differentbits |= _mm256_extract_epi8(diff, 24);
+
+    return (int) (1 & ((differentbits - 1) >> 8)) - 1;
+}
diff --git a/crypto_kem/sntrup653/avx2/crypto_verify_897.h b/crypto_kem/sntrup653/avx2/crypto_verify_897.h
new file mode 100644
index 00000000..4d3f4260
--- /dev/null
+++ b/crypto_kem/sntrup653/avx2/crypto_verify_897.h
@@ -0,0 +1,8 @@
+#ifndef PQCLEAN_SNTRUP653_AVX2_CRYPTO_VERIFY_897_H
+#define PQCLEAN_SNTRUP653_AVX2_CRYPTO_VERIFY_897_H
+
+#include <stdint.h>
+#define PQCLEAN_SNTRUP653_AVX2_crypto_verify_897_BYTES 897
+
+int PQCLEAN_SNTRUP653_AVX2_crypto_verify_897(const unsigned char *x, const unsigned char *y);
+#endif
diff --git a/crypto_kem/sntrup653/avx2/kem.c b/crypto_kem/sntrup653/avx2/kem.c
new file mode 100644
index 00000000..b9c026c5
--- /dev/null
+++ b/crypto_kem/sntrup653/avx2/kem.c
@@ -0,0 +1,247 @@
+#include "api.h"
+#include "crypto_sort_uint32.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sha2.h"
+
+
+
+#define int8 int8_t
+#define int16 int16_t
+#define int32
int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* ----- small polynomials */ + +/* R3_fromR(R_fromRq(r)) */ +static void R3_fromRq(small *out, const Fq *r) { + crypto_encode_pxfreeze3((unsigned char *) out, (unsigned char *) r); +} + +/* h = f*g in the ring R3 */ +static void R3_mult(small *h, const small *f, const small *g) { + crypto_core_mult3((unsigned char *) h, (const unsigned char *) f, (const unsigned char *) g); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* h = 3f in Rq */ +static void Rq_mult3(Fq *h, const Fq *f) { + crypto_encode_pxint16((unsigned char *) h, f); + crypto_core_scale3((unsigned char *) h, (const unsigned char *) h); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* out = 1/(3*in) in Rq */ +/* caller must have 2p+1 bytes free in out, not just 2p */ +static void Rq_recip3(Fq *out, const small *in) { + crypto_core_inv((unsigned char *) out, (const unsigned char *) in); + /* could check byte 2*p for failure; but, in context, inv always works */ + crypto_decode_pxint16(out, (unsigned char *) out); +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[ppadsort]; + int i; + + randombytes((unsigned char *) L, 4 * p); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < w; ++i) { + L[i] = L[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (L[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_SNTRUP653_AVX2_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +static void Small_random(small *out) { + uint32 L[p]; + int i; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + out[i] = (((L[i] & 0x3fffffff) * 3) >> 30) - 1; + } +} + +/* ----- Streamlined NTRU Prime */ + +typedef small Inputs[p]; /* passed by reference */ +#define Ciphertexts_bytes Rounded_bytes +#define SecretKeys_bytes (2*Small_bytes) +#define PublicKeys_bytes Rq_bytes +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +/* also set r_enc[0]=3 */ +/* also set x[0]=2, and x[1:1+Hash_bytes] = Hash3(r_enc) */ +/* also overwrite x[1+Hash_bytes:1+2*Hash_bytes] */ +static void Hide(unsigned char *x, unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + Fq h[p]; + int i; + + Small_encode(r_enc + 1, r); + Rq_decode(h, pk); + Rq_mult_small(h, r); + Round_and_encode(c, h); + r_enc[0] = 3; + Hash(x + 1, r_enc, 1 + Small_bytes); + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Hash_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c + Ciphertexts_bytes, x, 1 + Hash_bytes * 2); +} + + +int PQCLEAN_SNTRUP653_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char 
*sk) { + small g[p]; + for (;;) { + Small_random(g); + { + small v[p + 1]; + crypto_core_inv3((unsigned char *) v, (const unsigned char *) g); + if (v[p] == 0) { + Small_encode(sk + Small_bytes, v); + break; + } + } + } + { + small f[p]; + Short_random(f); + Small_encode(sk, f); + { + Fq h[p + 1]; + Rq_recip3(h, f); /* always works */ + Rq_mult_small(h, g); + Rq_encode(pk, h); + } + } + { + int i; + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Small_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Small_bytes); + } + return 0; +} + +int PQCLEAN_SNTRUP653_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + unsigned char cache[Hash_bytes]; + int i; + { + unsigned char y[1 + PublicKeys_bytes]; /* XXX: can eliminate with incremental hashing */ + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + { + Inputs r; + Short_random(r); + { + unsigned char r_enc[Small_bytes + 1]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(x, c, r_enc, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + } + return 0; +} + +int PQCLEAN_SNTRUP653_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Small_bytes; + int mask, i; + Inputs r; + { + Fq d[p]; + Rounded_decode(d, c); + { + small f[p]; + Small_decode(f, sk); + Rq_mult_small(d, f); + Rq_mult3(d, d); + } + { + small e[p]; + small v[p]; + R3_fromRq(e, d); + Small_decode(v, sk + Small_bytes); + R3_mult(r, e, v); + } + crypto_core_wforce((unsigned char *) r, (unsigned char *) r); + } + { + unsigned char r_enc[1 + Small_bytes]; + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + /* XXX: can use incremental hashing to reduce x size */ + + Hide(x, cnew, r_enc, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Small_bytes; ++i) { + r_enc[i + 1] ^= mask & (r_enc[i + 1] ^ rho[i]); + } + Hash(x + 1, r_enc, 1 + Small_bytes); /* XXX: can instead do cmov on cached hash of rho */ + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/sntrup653/avx2/params.h b/crypto_kem/sntrup653/avx2/params.h new file mode 100644 index 00000000..08c585b0 --- /dev/null +++ b/crypto_kem/sntrup653/avx2/params.h @@ -0,0 +1,71 @@ +#ifndef params_H +#define params_H +#include "crypto_core_inv3sntrup653.h" +#include "crypto_core_invsntrup653.h" +#include "crypto_core_mult3sntrup653.h" +#include "crypto_core_multsntrup653.h" +#include "crypto_core_scale3sntrup653.h" +#include "crypto_core_weightsntrup653.h" +#include "crypto_core_wforcesntrup653.h" +#include "crypto_decode_653x1541.h" +#include "crypto_decode_653x3.h" +#include "crypto_decode_653x4621.h" +#include "crypto_decode_653xint16.h" +#include "crypto_decode_653xint32.h" +#include "crypto_encode_653x1541.h" +#include "crypto_encode_653x1541round.h" +#include 
"crypto_encode_653x3.h" +#include "crypto_encode_653x4621.h" +#include "crypto_encode_653xfreeze3.h" +#include "crypto_encode_653xint16.h" +#include "crypto_encode_int16.h" +#include "crypto_verify_897.h" + + +#define p 653 +#define qinv (-29499) /* reciprocal of q mod 2^16 */ +#define q27 29045 /* closest integer to 2^27/q */ +#define q18 57 /* closest integer to 2^18/q */ +#define ppad 657 +#define crypto_core_weight PQCLEAN_SNTRUP653_AVX2_crypto_core_weightsntrup653 +#define q 4621 +#define w 288 + +#define ppadsort 653 + +#define crypto_verify_clen PQCLEAN_SNTRUP653_AVX2_crypto_verify_897 + +#define Rq_bytes PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x4621_STRBYTES +#define Rq_encode PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x4621 +#define Rq_decode PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x4621 + +#define Rounded_bytes PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x1541_STRBYTES +#define Rounded_decode PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x1541 + +#define Round_and_encode PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x1541round + +#define Small_bytes PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x3_STRBYTES +#define Small_encode PQCLEAN_SNTRUP653_AVX2_crypto_encode_653x3 +#define Small_decode PQCLEAN_SNTRUP653_AVX2_crypto_decode_653x3 + +#define crypto_encode_pxfreeze3 PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xfreeze3 + +#define crypto_decode_pxint32 PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint32 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP653_AVX2_crypto_decode_653xint16 + +#define crypto_encode_pxint16 PQCLEAN_SNTRUP653_AVX2_crypto_encode_653xint16 + +#define crypto_core_wforce PQCLEAN_SNTRUP653_AVX2_crypto_core_wforcesntrup653 + +#define crypto_core_scale3 PQCLEAN_SNTRUP653_AVX2_crypto_core_scale3sntrup653 + +#define crypto_core_inv PQCLEAN_SNTRUP653_AVX2_crypto_core_invsntrup653 + +#define crypto_core_inv3 PQCLEAN_SNTRUP653_AVX2_crypto_core_inv3sntrup653 + +#define crypto_core_mult PQCLEAN_SNTRUP653_AVX2_crypto_core_multsntrup653 + +#define crypto_core_mult3 PQCLEAN_SNTRUP653_AVX2_crypto_core_mult3sntrup653 + +#endif diff --git a/crypto_kem/sntrup653/clean/LICENSE b/crypto_kem/sntrup653/clean/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/sntrup653/clean/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/sntrup653/clean/Makefile b/crypto_kem/sntrup653/clean/Makefile new file mode 100644 index 00000000..81ded6d1 --- /dev/null +++ b/crypto_kem/sntrup653/clean/Makefile @@ -0,0 +1,19 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libsntrup653_clean.a +HEADERS=api.h crypto_core_inv3sntrup653.h crypto_core_invsntrup653.h crypto_core_mult3sntrup653.h crypto_core_multsntrup653.h crypto_core_scale3sntrup653.h crypto_core_weightsntrup653.h crypto_core_wforcesntrup653.h crypto_decode_653x1541.h crypto_decode_653x3.h crypto_decode_653x4621.h crypto_decode_653xint16.h crypto_decode_653xint32.h crypto_encode_653x1541.h crypto_encode_653x1541round.h crypto_encode_653x3.h crypto_encode_653x4621.h crypto_encode_653xfreeze3.h crypto_encode_653xint16.h crypto_encode_int16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_897.h params.h +OBJECTS=crypto_core_inv3sntrup653.o crypto_core_invsntrup653.o crypto_core_mult3sntrup653.o crypto_core_multsntrup653.o crypto_core_scale3sntrup653.o crypto_core_weightsntrup653.o crypto_core_wforcesntrup653.o crypto_decode_653x1541.o crypto_decode_653x3.o crypto_decode_653x4621.o crypto_decode_653xint16.o crypto_decode_653xint32.o crypto_encode_653x1541.o crypto_encode_653x1541round.o 
crypto_encode_653x3.o crypto_encode_653x4621.o crypto_encode_653xfreeze3.o crypto_encode_653xint16.o crypto_encode_int16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_897.o kem.o + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/sntrup653/clean/Makefile.Microsoft_nmake b/crypto_kem/sntrup653/clean/Makefile.Microsoft_nmake new file mode 100644 index 00000000..5d595797 --- /dev/null +++ b/crypto_kem/sntrup653/clean/Makefile.Microsoft_nmake @@ -0,0 +1,19 @@ +# This Makefile can be used with Microsoft Visual Studio's nmake using the command: +# nmake /f Makefile.Microsoft_nmake + +LIBRARY=libsntrup653_clean.lib +OBJECTS=crypto_core_inv3sntrup653.obj crypto_core_invsntrup653.obj crypto_core_mult3sntrup653.obj crypto_core_multsntrup653.obj crypto_core_scale3sntrup653.obj crypto_core_weightsntrup653.obj crypto_core_wforcesntrup653.obj crypto_decode_653x1541.obj crypto_decode_653x3.obj crypto_decode_653x4621.obj crypto_decode_653xint16.obj crypto_decode_653xint32.obj crypto_encode_653x1541.obj crypto_encode_653x1541round.obj crypto_encode_653x3.obj crypto_encode_653x4621.obj crypto_encode_653xfreeze3.obj crypto_encode_653xint16.obj crypto_encode_int16.obj crypto_sort_int32.obj crypto_sort_uint32.obj crypto_stream_aes256ctr.obj crypto_verify_897.obj kem.obj + +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX + +all: $(LIBRARY) + +# Make sure objects are recompiled if headers change. +$(OBJECTS): *.h + +$(LIBRARY): $(OBJECTS) + LIB.EXE /NOLOGO /WX /OUT:$@ $** + +clean: + -DEL $(OBJECTS) + -DEL $(LIBRARY) diff --git a/crypto_kem/sntrup653/clean/api.h b/crypto_kem/sntrup653/clean/api.h new file mode 100644 index 00000000..641b2079 --- /dev/null +++ b/crypto_kem/sntrup653/clean/api.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_API_H +#define PQCLEAN_SNTRUP653_CLEAN_API_H + + + +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ALGNAME "sntrup653" + +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_SECRETKEYBYTES 1518 +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_PUBLICKEYBYTES 994 +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CIPHERTEXTBYTES 897 +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_BYTES 32 + +int PQCLEAN_SNTRUP653_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int PQCLEAN_SNTRUP653_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); +int PQCLEAN_SNTRUP653_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_core_inv3sntrup653.c b/crypto_kem/sntrup653/clean/crypto_core_inv3sntrup653.c new file mode 100644 index 00000000..27a30edc --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_inv3sntrup653.c @@ -0,0 +1,110 @@ +#include "crypto_core_inv3sntrup653.h" +#include "params.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u 
>>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* works for -16384 <= x < 16384 */ +static small F3_freeze(int16 x) { + return x - 3 * ((10923 * x + 16384) >> 15); +} + +/* byte p of output is 0 if recip succeeded; else -1 */ +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_inv3sntrup653(unsigned char *outbytes, const unsigned char *inbytes) { + small *out = (void *) outbytes; + small *in = (void *) inbytes; + small f[p + 1], g[p + 1], v[p + 1], r[p + 1]; + int i, loop, delta; + int sign, swap, t; + + for (i = 0; i < p + 1; ++i) { + v[i] = 0; + } + for (i = 0; i < p + 1; ++i) { + r[i] = 0; + } + r[0] = 1; + for (i = 0; i < p; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = f[p] = -1; + for (i = 0; i < p; ++i) { + small i1 = in[i] & 1; + g[p - 1 - i] = i1 - (in[i] & (i1 << 1)); + } + g[p] = 0; + + delta = 1; + + for (loop = 0; loop < 2 * p - 1; ++loop) { + for (i = p; i > 0; --i) { + v[i] = v[i - 1]; + } + v[0] = 0; + + sign = -g[0] * f[0]; + swap = int16_negative_mask(-delta) & int16_nonzero_mask(g[0]); + delta ^= swap & (delta ^ -delta); + delta += 1; + + for (i = 0; i < p + 1; ++i) { + t = swap & (f[i] ^ g[i]); + f[i] ^= t; + g[i] ^= t; + t = swap & (v[i] ^ r[i]); + v[i] ^= t; + r[i] ^= t; + } + + for (i = 0; i < p + 1; ++i) { + g[i] = F3_freeze(g[i] + sign * f[i]); + } + for (i = 0; i < p + 1; ++i) { + r[i] = F3_freeze(r[i] + sign * v[i]); + } + + for (i = 0; i < p; ++i) { + g[i] = g[i + 1]; + } + g[p] = 0; + } + + sign = f[0]; + for (i = 0; i < p; ++i) { + out[i] = sign * v[p - 1 - i]; + } + + out[p] = int16_nonzero_mask(delta); + return 0; +} diff --git a/crypto_kem/sntrup653/clean/crypto_core_inv3sntrup653.h b/crypto_kem/sntrup653/clean/crypto_core_inv3sntrup653.h new file mode 100644 index 00000000..3e0d817e --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_inv3sntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_INV3SNTRUP653_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_INV3SNTRUP653_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_inv3sntrup653_OUTPUTBYTES 654 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_inv3sntrup653_INPUTBYTES 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_inv3sntrup653_KEYBYTES 0 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_inv3sntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_inv3sntrup653(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_core_invsntrup653.c b/crypto_kem/sntrup653/clean/crypto_core_invsntrup653.c new file mode 100644 index 00000000..e69c7c40 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_invsntrup653.c @@ -0,0 +1,131 @@ +#include "crypto_core_invsntrup653.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod q */ + 
+typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +static Fq Fq_recip(Fq a1) { + int i = 1; + Fq ai = a1; + + while (i < q - 2) { + ai = Fq_freeze(a1 * (int32)ai); + i += 1; + } + return ai; +} + +/* ----- polynomials mod q */ + +/* out = 1/(3*in) in Rq */ +/* outbytes[2*p] is 0 if recip succeeded; else -1 */ +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_invsntrup653(unsigned char *outbytes, const unsigned char *inbytes) { + small *in = (void *) inbytes; + Fq out[p], f[p + 1], g[p + 1], v[p + 1], r[p + 1]; + int i, loop, delta; + int swap, t; + int32 f0, g0; + Fq scale; + + for (i = 0; i < p + 1; ++i) { + v[i] = 0; + } + for (i = 0; i < p + 1; ++i) { + r[i] = 0; + } + r[0] = Fq_recip(3); + for (i = 0; i < p; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = f[p] = -1; + for (i = 0; i < p; ++i) { + g[p - 1 - i] = in[i]; + } + g[p] = 0; + + delta = 1; + + for (loop = 0; loop < 2 * p - 1; ++loop) { + for (i = p; i > 0; --i) { + v[i] = v[i - 1]; + } + v[0] = 0; + + swap = int16_negative_mask(-delta) & int16_nonzero_mask(g[0]); + delta ^= swap & (delta ^ -delta); + delta += 1; + + for (i = 0; i < p + 1; ++i) { + t = swap & (f[i] ^ g[i]); + f[i] ^= t; + g[i] ^= t; + t = swap & (v[i] ^ r[i]); + v[i] ^= t; + r[i] ^= t; + } + + f0 = f[0]; + g0 = g[0]; + for (i = 0; i < p + 1; ++i) { + g[i] = Fq_freeze(f0 * g[i] - g0 * f[i]); + } + for (i = 0; i < p + 1; ++i) { + r[i] = Fq_freeze(f0 * r[i] - g0 * v[i]); + } + + for (i = 0; i < p; ++i) { + g[i] = g[i + 1]; + } + g[p] = 0; + } + + scale = Fq_recip(f[0]); + for (i = 0; i < p; ++i) { + out[i] = Fq_freeze(scale * (int32)v[p - 1 - i]); + } + + crypto_encode_pxint16(outbytes, out); + + outbytes[2 * p] = int16_nonzero_mask(delta); + return 0; +} diff --git a/crypto_kem/sntrup653/clean/crypto_core_invsntrup653.h b/crypto_kem/sntrup653/clean/crypto_core_invsntrup653.h new file mode 100644 index 00000000..ab7301fd --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_invsntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_INVSNTRUP653_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_INVSNTRUP653_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_invsntrup653_OUTPUTBYTES 1307 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_invsntrup653_INPUTBYTES 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_invsntrup653_KEYBYTES 0 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_invsntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_invsntrup653(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_core_mult3sntrup653.c b/crypto_kem/sntrup653/clean/crypto_core_mult3sntrup653.c new file mode 100644 index 00000000..3947b9df --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_mult3sntrup653.c @@ -0,0 +1,57 @@ +#include "crypto_core_mult3sntrup653.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +typedef int8 small; + +/* works for -16384 <= x < 16384 */ +static small F3_freeze(int16 x) { + return x - 3 * ((10923 * x + 16384) >> 15); +} + +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_mult3sntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + small *h = (void *) outbytes; + small f[p]; + small g[p]; + small fg[p + p - 1]; + int16 result; + int i, j; + + for (i = 0; i < p; ++i) { + small fi = 
inbytes[i]; + small fi0 = fi & 1; + f[i] = fi0 - (fi & (fi0 << 1)); + } + for (i = 0; i < p; ++i) { + small gi = kbytes[i]; + small gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + for (i = 0; i < p; ++i) { + result = 0; + for (j = 0; j <= i; ++j) { + result += f[j] * g[i - j]; + } + fg[i] = F3_freeze(result); + } + for (i = p; i < p + p - 1; ++i) { + result = 0; + for (j = i - p + 1; j < p; ++j) { + result += f[j] * g[i - j]; + } + fg[i] = F3_freeze(result); + } + + for (i = p + p - 2; i >= p; --i) { + fg[i - p] = F3_freeze(fg[i - p] + fg[i]); + fg[i - p + 1] = F3_freeze(fg[i - p + 1] + fg[i]); + } + + for (i = 0; i < p; ++i) { + h[i] = fg[i]; + } + return 0; +} diff --git a/crypto_kem/sntrup653/clean/crypto_core_mult3sntrup653.h b/crypto_kem/sntrup653/clean/crypto_core_mult3sntrup653.h new file mode 100644 index 00000000..0a944710 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_mult3sntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_MULT3SNTRUP653_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_MULT3SNTRUP653_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_mult3sntrup653_OUTPUTBYTES 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_mult3sntrup653_INPUTBYTES 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_mult3sntrup653_KEYBYTES 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_mult3sntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_mult3sntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_core_multsntrup653.c b/crypto_kem/sntrup653/clean/crypto_core_multsntrup653.c new file mode 100644 index 00000000..5b0e6c98 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_multsntrup653.c @@ -0,0 +1,60 @@ +#include "crypto_core_multsntrup653.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_multsntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + Fq f[p]; + small g[p]; + Fq fg[p + p - 1]; + int32 result; + int i, j; + + crypto_decode_pxint16(f, inbytes); + for (i = 0; i < p; ++i) { + f[i] = Fq_freeze(f[i]); + } + + for (i = 0; i < p; ++i) { + small gi = kbytes[i]; + small gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + for (i = 0; i < p; ++i) { + result = 0; + for (j = 0; j <= i; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + for (i = p; i < p + p - 1; ++i) { + result = 0; + for (j = i - p + 1; j < p; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + + for (i = p + p - 2; i >= p; --i) { + fg[i - p] = Fq_freeze(fg[i - p] + fg[i]); + fg[i - p + 1] = Fq_freeze(fg[i - p + 1] + fg[i]); + } + + crypto_encode_pxint16(outbytes, fg); + return 0; +} diff --git a/crypto_kem/sntrup653/clean/crypto_core_multsntrup653.h b/crypto_kem/sntrup653/clean/crypto_core_multsntrup653.h new file mode 100644 index 00000000..44f4715b --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_multsntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_MULTSNTRUP653_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_MULTSNTRUP653_H + +#include +#define 
PQCLEAN_SNTRUP653_CLEAN_crypto_core_multsntrup653_OUTPUTBYTES 1306 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_multsntrup653_INPUTBYTES 1306 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_multsntrup653_KEYBYTES 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_multsntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_multsntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_core_scale3sntrup653.c b/crypto_kem/sntrup653/clean/crypto_core_scale3sntrup653.c new file mode 100644 index 00000000..edac527b --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_scale3sntrup653.c @@ -0,0 +1,32 @@ +#include "crypto_core_scale3sntrup653.h" +#include "crypto_decode_653xint16.h" +#include "crypto_encode_653xint16.h" + + +#define p 653 +#define q 4621 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xint16 + +typedef int16_t Fq; + +/* out = 3*in in Rq */ +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_scale3sntrup653(unsigned char *outbytes, const unsigned char *inbytes) { + Fq f[p]; + int i; + + crypto_decode_pxint16(f, inbytes); + for (i = 0; i < p; ++i) { + Fq x = f[i]; + x *= 3; /* (-3q+3)/2 ... (3q-3)/2 */ + x -= (q + 1) / 2; /* -2q+1 ... q-2 */ + x += q & (x >> 15); /* -q+1 ... q-1 */ + x += q & (x >> 15); /* 0 ... q-1 */ + x -= (q - 1) / 2; /* -(q-1)/2 ... (q-1)/2 */ + f[i] = x; + } + crypto_encode_pxint16(outbytes, f); + + return 0; +} diff --git a/crypto_kem/sntrup653/clean/crypto_core_scale3sntrup653.h b/crypto_kem/sntrup653/clean/crypto_core_scale3sntrup653.h new file mode 100644 index 00000000..70493195 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_scale3sntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_SCALE3SNTRUP653_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_SCALE3SNTRUP653_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_scale3sntrup653_OUTPUTBYTES 1306 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_scale3sntrup653_INPUTBYTES 1306 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_scale3sntrup653_KEYBYTES 0 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_scale3sntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_scale3sntrup653(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_core_weightsntrup653.c b/crypto_kem/sntrup653/clean/crypto_core_weightsntrup653.c new file mode 100644 index 00000000..737816f0 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_weightsntrup653.c @@ -0,0 +1,21 @@ +#include "crypto_core_weightsntrup653.h" +#include "crypto_encode_int16.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t + + +/* out = little-endian weight of bottom bits of in */ +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_weightsntrup653(unsigned char *outbytes, const unsigned char *inbytes) { + int8 *in = (void *) inbytes; + int16 weight = 0; + int i; + + for (i = 0; i < p; ++i) { + weight += in[i] & 1; + } + PQCLEAN_SNTRUP653_CLEAN_crypto_encode_int16(outbytes, &weight); + return 0; +} diff --git a/crypto_kem/sntrup653/clean/crypto_core_weightsntrup653.h b/crypto_kem/sntrup653/clean/crypto_core_weightsntrup653.h new file mode 100644 index 00000000..244ca54c --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_weightsntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_WEIGHTSNTRUP653_H +#define 
PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_WEIGHTSNTRUP653_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_weightsntrup653_OUTPUTBYTES 2 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_weightsntrup653_INPUTBYTES 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_weightsntrup653_KEYBYTES 0 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_weightsntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_weightsntrup653(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_core_wforcesntrup653.c b/crypto_kem/sntrup653/clean/crypto_core_wforcesntrup653.c new file mode 100644 index 00000000..f8a64711 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_wforcesntrup653.c @@ -0,0 +1,48 @@ +#include "crypto_core_wforcesntrup653.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +typedef int8 small; + + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* 0 if Weightw_is(r), else -1 */ +static int Weightw_mask(const small *r) { + int weight = 0; + int i; + + for (i = 0; i < p; ++i) { + weight += r[i] & 1; + } + return int16_nonzero_mask(weight - w); +} + +/* out = in if bottom bits of in have weight w */ +/* otherwise out = (1,1,...,1,0,0,...,0) */ +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_wforcesntrup653(unsigned char *outbytes, const unsigned char *inbytes) { + small *out = (void *) outbytes; + const small *in = (const void *) inbytes; + int i, mask; + + mask = Weightw_mask(in); /* 0 if weight w, else -1 */ + for (i = 0; i < w; ++i) { + out[i] = ((in[i] ^ 1) & ~mask) ^ 1; + } + for (i = w; i < p; ++i) { + out[i] = in[i] & ~mask; + } + return 0; +} diff --git a/crypto_kem/sntrup653/clean/crypto_core_wforcesntrup653.h b/crypto_kem/sntrup653/clean/crypto_core_wforcesntrup653.h new file mode 100644 index 00000000..3ba291e2 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_core_wforcesntrup653.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_WFORCESNTRUP653_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_CORE_WFORCESNTRUP653_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_wforcesntrup653_OUTPUTBYTES 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_wforcesntrup653_INPUTBYTES 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_wforcesntrup653_KEYBYTES 0 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_core_wforcesntrup653_CONSTBYTES 0 + +int PQCLEAN_SNTRUP653_CLEAN_crypto_core_wforcesntrup653(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_decode_653x1541.c b/crypto_kem/sntrup653/clean/crypto_decode_653x1541.c new file mode 100644 index 00000000..b6a10aca --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_decode_653x1541.c @@ -0,0 +1,200 @@ +#include "crypto_decode_653x1541.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* +CPU division instruction typically takes time depending on x. +This software is designed to take time independent of x. +Time still varies depending on m; user must ensure that m is constant. +Time also varies on CPUs where multiplication is variable-time. +There could be more CPU issues. +There could also be compiler issues. 
+*/ + +static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { + uint32 v = 0x80000000; + uint32 qpart; + uint32 mask; + + v /= m; + + /* caller guarantees m > 0 */ + /* caller guarantees m < 16384 */ + /* vm <= 2^31 <= vm+m-1 */ + /* xvm <= 2^31 x <= xvm+x(m-1) */ + + *q = 0; + + qpart = (x * (uint64)v) >> 31; + /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ + /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ + /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ + /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ + + x -= qpart * m; + *q += qpart; + /* x <= 49146 */ + + qpart = (x * (uint64)v) >> 31; + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ + /* 0 <= newx <= m + 0.4 */ + /* 0 <= newx <= m */ + + x -= qpart * m; + *q += qpart; + /* x <= m */ + + x -= m; + *q += 1; + mask = -(x >> 31); + x += mask & (uint32)m; + *q += mask; + /* x < m */ + + *r = x; +} + +static uint16 uint32_mod_uint14(uint32 x, uint16 m) { + uint32 q; + uint16 r; + uint32_divmod_uint14(&q, &r, x, m); + return r; +} + +void PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x1541(void *v, const unsigned char *s) { + int16 *R0 = v; + uint16 R1[327], R2[164], R3[82], R4[41], R5[21], R6[11], R7[6], R8[3], R9[2], R10[1]; + long long i; + uint16 r0; + uint32 r1, r2; + + s += PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x1541_STRBYTES; + r1 = 0; + r1 = (r1 << 8) | *--s; + r1 = (r1 << 8) | *--s; + r1 = uint32_mod_uint14(r1, 2608); /* needed only for invalid inputs */ + R10[0] = r1; + + r2 = R10[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 71); + R9[0] = r0; + r1 = uint32_mod_uint14(r1, 9402); /* needed only for invalid inputs */ + R9[1] = r1; + + R8[2] = R9[1]; + r2 = R9[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 134); + R8[0] = r0; + r1 = uint32_mod_uint14(r1, 134); /* needed only for invalid inputs */ + R8[1] = r1; + + r2 = R8[2]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 2953); + R7[4] = r0; + r1 = uint32_mod_uint14(r1, 815); /* needed only for invalid inputs */ + R7[5] = r1; + for (i = 1; i >= 0; --i) { + r2 = R8[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 2953); + R7[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 2953); /* needed only for invalid inputs */ + R7[2 * i + 1] = r1; + } + + R6[10] = R7[5]; + for (i = 4; i >= 0; --i) { + r2 = R7[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 13910); + R6[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 13910); /* needed only for invalid inputs */ + R6[2 * i + 1] = r1; + } + + R5[20] = R6[10]; + for (i = 9; i >= 0; --i) { + r2 = R6[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1887); + R5[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1887); /* needed only for invalid inputs */ + R5[2 * i + 1] = r1; + } + + R4[40] = R5[20]; + for (i = 19; i >= 0; --i) { + r2 = R5[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 695); + R4[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 695); /* needed only for invalid inputs */ + R4[2 * i + 1] = r1; + } + + r2 = R4[40]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6745); + R3[80] = r0; + r1 = uint32_mod_uint14(r1, 7910); /* needed only for invalid inputs */ + R3[81] = r1; + for (i = 39; i >= 0; --i) { + r2 = R4[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | 
*--s; + uint32_divmod_uint14(&r1, &r0, r2, 6745); + R3[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 6745); /* needed only for invalid inputs */ + R3[2 * i + 1] = r1; + } + + r2 = R3[81]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1314); + R2[162] = r0; + r1 = uint32_mod_uint14(r1, 1541); /* needed only for invalid inputs */ + R2[163] = r1; + for (i = 80; i >= 0; --i) { + r2 = R3[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1314); + R2[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1314); /* needed only for invalid inputs */ + R2[2 * i + 1] = r1; + } + + R1[326] = R2[163]; + for (i = 162; i >= 0; --i) { + r2 = R2[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9277); + R1[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 9277); /* needed only for invalid inputs */ + R1[2 * i + 1] = r1; + } + + R0[652] = 3 * R1[326] - 2310; + for (i = 325; i >= 0; --i) { + r2 = R1[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1541); + R0[2 * i] = 3 * r0 - 2310; + r1 = uint32_mod_uint14(r1, 1541); /* needed only for invalid inputs */ + R0[2 * i + 1] = 3 * r1 - 2310; + } +} diff --git a/crypto_kem/sntrup653/clean/crypto_decode_653x1541.h b/crypto_kem/sntrup653/clean/crypto_decode_653x1541.h new file mode 100644 index 00000000..1af235df --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_decode_653x1541.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_DECODE_653X1541_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_DECODE_653X1541_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x1541_STRBYTES 865 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x1541_ITEMS 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x1541_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x1541(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_decode_653x3.c b/crypto_kem/sntrup653/clean/crypto_decode_653x3.c new file mode 100644 index 00000000..ccb11adb --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_decode_653x3.c @@ -0,0 +1,24 @@ +#include "crypto_decode_653x3.h" + +#define uint8 uint8_t + +#define p 653 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x3(void *v, const unsigned char *s) { + uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + } + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; +} diff --git a/crypto_kem/sntrup653/clean/crypto_decode_653x3.h b/crypto_kem/sntrup653/clean/crypto_decode_653x3.h new file mode 100644 index 00000000..ded47a1f --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_decode_653x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_DECODE_653X3_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_DECODE_653X3_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x3_STRBYTES 164 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x3_ITEMS 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x3(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_decode_653x4621.c b/crypto_kem/sntrup653/clean/crypto_decode_653x4621.c new file mode 100644 index 00000000..2bf05aaa --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_decode_653x4621.c @@ -0,0 +1,198 @@ +#include "crypto_decode_653x4621.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define 
uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* +CPU division instruction typically takes time depending on x. +This software is designed to take time independent of x. +Time still varies depending on m; user must ensure that m is constant. +Time also varies on CPUs where multiplication is variable-time. +There could be more CPU issues. +There could also be compiler issues. +*/ + +static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { + uint32 v = 0x80000000; + uint32 qpart; + uint32 mask; + + v /= m; + + /* caller guarantees m > 0 */ + /* caller guarantees m < 16384 */ + /* vm <= 2^31 <= vm+m-1 */ + /* xvm <= 2^31 x <= xvm+x(m-1) */ + + *q = 0; + + qpart = (x * (uint64)v) >> 31; + /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ + /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ + /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ + /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ + + x -= qpart * m; + *q += qpart; + /* x <= 49146 */ + + qpart = (x * (uint64)v) >> 31; + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ + /* 0 <= newx <= m + 0.4 */ + /* 0 <= newx <= m */ + + x -= qpart * m; + *q += qpart; + /* x <= m */ + + x -= m; + *q += 1; + mask = -(x >> 31); + x += mask & (uint32)m; + *q += mask; + /* x < m */ + + *r = x; +} + +static uint16 uint32_mod_uint14(uint32 x, uint16 m) { + uint32 q; + uint16 r; + uint32_divmod_uint14(&q, &r, x, m); + return r; +} + +void PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x4621(void *v, const unsigned char *s) { + int16 *R0 = v; + uint16 R1[327], R2[164], R3[82], R4[41], R5[21], R6[11], R7[6], R8[3], R9[2], R10[1]; + long long i; + uint16 r0; + uint32 r1, r2; + + s += PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x4621_STRBYTES; + r1 = 0; + r1 = (r1 << 8) | *--s; + r1 = uint32_mod_uint14(r1, 86); /* needed only for invalid inputs */ + R10[0] = r1; + + r2 = R10[0]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 835); + R9[0] = r0; + r1 = uint32_mod_uint14(r1, 6708); /* needed only for invalid inputs */ + R9[1] = r1; + + R8[2] = R9[1]; + r2 = R9[0]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 7396); + R8[0] = r0; + r1 = uint32_mod_uint14(r1, 7396); /* needed only for invalid inputs */ + R8[1] = r1; + + r2 = R8[2]; + uint32_divmod_uint14(&r1, &r0, r2, 86); + R7[4] = r0; + r1 = uint32_mod_uint14(r1, 78); /* needed only for invalid inputs */ + R7[5] = r1; + for (i = 1; i >= 0; --i) { + r2 = R8[i]; + uint32_divmod_uint14(&r1, &r0, r2, 86); + R7[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 86); /* needed only for invalid inputs */ + R7[2 * i + 1] = r1; + } + + R6[10] = R7[5]; + for (i = 4; i >= 0; --i) { + r2 = R7[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 2370); + R6[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 2370); /* needed only for invalid inputs */ + R6[2 * i + 1] = r1; + } + + R5[20] = R6[10]; + for (i = 9; i >= 0; --i) { + r2 = R6[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 12461); + R5[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 12461); /* needed only for invalid inputs */ + R5[2 * i + 1] = r1; + } + + R4[40] = R5[20]; + for (i = 19; i >= 0; --i) { + r2 = R5[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1786); + R4[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 
1786); /* needed only for invalid inputs */ + R4[2 * i + 1] = r1; + } + + r2 = R4[40]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 676); + R3[80] = r0; + r1 = uint32_mod_uint14(r1, 7510); /* needed only for invalid inputs */ + R3[81] = r1; + for (i = 39; i >= 0; --i) { + r2 = R4[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 676); + R3[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 676); /* needed only for invalid inputs */ + R3[2 * i + 1] = r1; + } + + r2 = R3[81]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 416); + R2[162] = r0; + r1 = uint32_mod_uint14(r1, 4621); /* needed only for invalid inputs */ + R2[163] = r1; + for (i = 80; i >= 0; --i) { + r2 = R3[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 416); + R2[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 416); /* needed only for invalid inputs */ + R2[2 * i + 1] = r1; + } + + R1[326] = R2[163]; + for (i = 162; i >= 0; --i) { + r2 = R2[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 326); + R1[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 326); /* needed only for invalid inputs */ + R1[2 * i + 1] = r1; + } + + R0[652] = R1[326] - 2310; + for (i = 325; i >= 0; --i) { + r2 = R1[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 4621); + R0[2 * i] = r0 - 2310; + r1 = uint32_mod_uint14(r1, 4621); /* needed only for invalid inputs */ + R0[2 * i + 1] = r1 - 2310; + } +} diff --git a/crypto_kem/sntrup653/clean/crypto_decode_653x4621.h b/crypto_kem/sntrup653/clean/crypto_decode_653x4621.h new file mode 100644 index 00000000..e010ce8b --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_decode_653x4621.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_DECODE_653X4621_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_DECODE_653X4621_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x4621_STRBYTES 994 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x4621_ITEMS 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x4621_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x4621(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_decode_653xint16.c b/crypto_kem/sntrup653/clean/crypto_decode_653xint16.c new file mode 100644 index 00000000..655095cd --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_decode_653xint16.c @@ -0,0 +1,16 @@ +#include "crypto_decode_653xint16.h" + + +void PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git a/crypto_kem/sntrup653/clean/crypto_decode_653xint16.h b/crypto_kem/sntrup653/clean/crypto_decode_653xint16.h new file mode 100644 index 00000000..7aa7568c --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_decode_653xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_DECODE_653XINT16_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_DECODE_653XINT16_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint16_STRBYTES 1306 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint16_ITEMS 653 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_decode_653xint32.c b/crypto_kem/sntrup653/clean/crypto_decode_653xint32.c new file mode 100644 index 
00000000..fc128fdf --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_decode_653xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_653xint32.h" + + +void PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/crypto_kem/sntrup653/clean/crypto_decode_653xint32.h b/crypto_kem/sntrup653/clean/crypto_decode_653xint32.h new file mode 100644 index 00000000..aa1c19e8 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_decode_653xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_DECODE_653XINT32_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_DECODE_653XINT32_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint32_STRBYTES 2612 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint32_ITEMBYTES 4 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint32_ITEMS 653 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint32(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653x1541.c b/crypto_kem/sntrup653/clean/crypto_encode_653x1541.c new file mode 100644 index 00000000..6b442cb2 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653x1541.c @@ -0,0 +1,127 @@ +#include "crypto_encode_653x1541.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[327]; + long i; + uint16 r0, r1; + uint32 r2; + + for (i = 0; i < 326; ++i) { + r0 = (((R0[2 * i] + 2310) & 16383) * 10923) >> 15; + r1 = (((R0[2 * i + 1] + 2310) & 16383) * 10923) >> 15; + r2 = r0 + r1 * (uint32)1541; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[326] = (((R0[652] + 2310) & 16383) * 10923) >> 15; + + for (i = 0; i < 163; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9277; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[163] = R[326]; + + for (i = 0; i < 82; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1314; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 41; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)6745; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 20; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)695; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[20] = R[40]; + + for (i = 0; i < 10; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1887; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[10] = R[20]; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)13910; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[5] = R[10]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[4]; + r1 = R[5]; + r2 = r0 + r1 * (uint32)2953; + *out++ = r2; + r2 >>= 8; + R[2] = r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)134; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)71; + *out++ = r2; + r2 >>= 8; + R[0] = 
r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653x1541.h b/crypto_kem/sntrup653/clean/crypto_encode_653x1541.h new file mode 100644 index 00000000..d99ce72f --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653x1541.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653X1541_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653X1541_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541_STRBYTES 865 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541_ITEMS 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653x1541round.c b/crypto_kem/sntrup653/clean/crypto_encode_653x1541round.c new file mode 100644 index 00000000..b53d2dce --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653x1541round.c @@ -0,0 +1,17 @@ +#include "crypto_encode_653x1541.h" +#include "crypto_encode_653x1541round.h" + +#define int16 int16_t + +#define p 653 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541round(unsigned char *out, const void *v) { + const int16 *a = v; + int16 x[p]; + int i; + + for (i = 0; i < p; ++i) { + x[i] = 3 * ((10923 * a[i] + 16384) >> 15); + } + PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541(out, x); +} diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653x1541round.h b/crypto_kem/sntrup653/clean/crypto_encode_653x1541round.h new file mode 100644 index 00000000..11595feb --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653x1541round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653X1541ROUND_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653X1541ROUND_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541round_STRBYTES 865 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541round_ITEMS 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541round_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653x3.c b/crypto_kem/sntrup653/clean/crypto_encode_653x3.c new file mode 100644 index 00000000..f6628a47 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653x3.c @@ -0,0 +1,21 @@ +#include "crypto_encode_653x3.h" + +#define uint8 uint8_t + +#define p 653 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x3(unsigned char *s, const void *v) { + const uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *f++ + 1; + x += (*f++ + 1) << 2; + x += (*f++ + 1) << 4; + x += (*f++ + 1) << 6; + *s++ = x; + } + x = *f++ + 1; + *s++ = x; +} diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653x3.h b/crypto_kem/sntrup653/clean/crypto_encode_653x3.h new file mode 100644 index 00000000..b3340bdd --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653X3_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653X3_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x3_STRBYTES 164 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x3_ITEMS 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653x4621.c 
b/crypto_kem/sntrup653/clean/crypto_encode_653x4621.c new file mode 100644 index 00000000..7c3f120f --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653x4621.c @@ -0,0 +1,127 @@ +#include "crypto_encode_653x4621.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x4621(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[327]; + long i; + uint16 r0, r1; + uint32 r2; + + for (i = 0; i < 326; ++i) { + r0 = (R0[2 * i] + 2310) & 16383; + r1 = (R0[2 * i + 1] + 2310) & 16383; + r2 = r0 + r1 * (uint32)4621; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[326] = (R0[652] + 2310) & 16383; + + for (i = 0; i < 163; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)326; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[163] = R[326]; + + for (i = 0; i < 82; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)416; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 40; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)676; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[80]; + r1 = R[81]; + r2 = r0 + r1 * (uint32)676; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[40] = r2; + + for (i = 0; i < 20; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1786; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[20] = R[40]; + + for (i = 0; i < 10; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)12461; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[10] = R[20]; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2370; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[5] = R[10]; + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)86; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)7396; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)835; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653x4621.h b/crypto_kem/sntrup653/clean/crypto_encode_653x4621.h new file mode 100644 index 00000000..bc0ffd44 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653x4621.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653X4621_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653X4621_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x4621_STRBYTES 994 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x4621_ITEMS 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x4621_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x4621(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653xfreeze3.c b/crypto_kem/sntrup653/clean/crypto_encode_653xfreeze3.c new file mode 100644 index 00000000..b938d071 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653xfreeze3.c @@ -0,0 +1,25 @@ +#include "crypto_encode_653xfreeze3.h" + +#define int16 int16_t + +#define p 653 + +/* valid inputs: -16384 <= x < 16384 */ +/* then 3 divides x-F3_freeze(x) */ +/* and F3_freeze(x) is in {-1,0,1} */ + +/* all inputs: 3 
divides x-F3_freeze(x) */ +/* and F3_freeze(x) is in {-2,-1,0,1,2} */ + +static inline unsigned char F3_freeze(int16 x) { + return x - 3 * ((10923 * x + 16384) >> 15); +} + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xfreeze3(unsigned char *s, const void *v) { + const int16 *r = v; + + int i; + for (i = 0; i < p; ++i) { + s[i] = F3_freeze(r[i]); + } +} diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653xfreeze3.h b/crypto_kem/sntrup653/clean/crypto_encode_653xfreeze3.h new file mode 100644 index 00000000..2efc8170 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653xfreeze3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653XFREEZE3_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653XFREEZE3_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xfreeze3_STRBYTES 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xfreeze3_ITEMS 653 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xfreeze3_ITEMBYTES 2 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xfreeze3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653xint16.c b/crypto_kem/sntrup653/clean/crypto_encode_653xint16.c new file mode 100644 index 00000000..8555ae26 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_653xint16.h" + + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 653; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/sntrup653/clean/crypto_encode_653xint16.h b/crypto_kem/sntrup653/clean/crypto_encode_653xint16.h new file mode 100644 index 00000000..a06da0aa --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_653xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653XINT16_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_653XINT16_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xint16_STRBYTES 1306 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xint16_ITEMS 653 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_encode_int16.c b/crypto_kem/sntrup653/clean/crypto_encode_int16.c new file mode 100644 index 00000000..01d126c0 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_int16.c @@ -0,0 +1,9 @@ +#include "crypto_encode_int16.h" + +#define uint16 uint16_t + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_int16(unsigned char *s, const void *x) { + uint16 u = *(const uint16 *) x; + s[0] = u; + s[1] = u >> 8; +} diff --git a/crypto_kem/sntrup653/clean/crypto_encode_int16.h b/crypto_kem/sntrup653/clean/crypto_encode_int16.h new file mode 100644 index 00000000..27ce185f --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_encode_int16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_INT16_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_ENCODE_INT16_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_int16_STRBYTES 2 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_int16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP653_CLEAN_crypto_encode_int16_ITEMS 1 + +void PQCLEAN_SNTRUP653_CLEAN_crypto_encode_int16(unsigned char *s, const void *x); +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_sort_int32.c b/crypto_kem/sntrup653/clean/crypto_sort_int32.c new file mode 100644 index 00000000..793b7580 --- /dev/null +++ 
b/crypto_kem/sntrup653/clean/crypto_sort_int32.c @@ -0,0 +1,86 @@ +#include "crypto_sort_int32.h" +#include +// Based on supercop-20190110/crypto_sort/int32/x86 + + +#define int32 int32_t + +#define int32_MINMAX(a,b) \ + do { \ + int32_t ab = (b) ^ (a); \ + int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \ + c ^= ab & (c ^ (b)); \ + c >>= 31; \ + c &= ab; \ + (a) ^= c; \ + (b) ^= c; \ + } while(0) + +/* assume 2 <= n <= 0x40000000 */ +void PQCLEAN_SNTRUP653_CLEAN_crypto_sort_int32(int32 *array, size_t n) { + size_t top, p, q, r, i, j; + int32 *x = array; + + top = 1; + while (top < n - top) { + top += top; + } + + for (p = top; p >= 1; p >>= 1) { + i = 0; + while (i + 2 * p <= n) { + for (j = i; j < i + p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + i += 2 * p; + } + for (j = i; j < n - p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + + i = 0; + j = 0; + for (q = top; q > p; q >>= 1) { + if (j != i) { + for (;;) { + if (j == n - q) { + goto done; + } + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + if (j == i + p) { + i += 2 * p; + break; + } + } + } + while (i + p <= n - q) { + for (j = i; j < i + p; ++j) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + } + i += 2 * p; + } + /* now i + p > n - q */ + j = i; + while (j < n - q) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + } + +done: + ; + } + } +} diff --git a/crypto_kem/sntrup653/clean/crypto_sort_int32.h b/crypto_kem/sntrup653/clean/crypto_sort_int32.h new file mode 100644 index 00000000..7b9ba26a --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_SORT_INT32_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_SORT_INT32_H + +#include +#include + + +void PQCLEAN_SNTRUP653_CLEAN_crypto_sort_int32(int32_t *array, size_t n); + +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_sort_uint32.c b/crypto_kem/sntrup653/clean/crypto_sort_uint32.c new file mode 100644 index 00000000..1c7b815f --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_SNTRUP653_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_SNTRUP653_CLEAN_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/sntrup653/clean/crypto_sort_uint32.h b/crypto_kem/sntrup653/clean/crypto_sort_uint32.h new file mode 100644 index 00000000..09b336a3 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_sort_uint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_SORT_UINT32_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_SORT_UINT32_H + +#include +#include + + +void PQCLEAN_SNTRUP653_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n); + +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_stream_aes256ctr.c b/crypto_kem/sntrup653/clean/crypto_stream_aes256ctr.c new file mode 100644 index 00000000..61f66408 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_stream_aes256ctr.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" + + +int PQCLEAN_SNTRUP653_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t 
outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); + return 0; +} diff --git a/crypto_kem/sntrup653/clean/crypto_stream_aes256ctr.h b/crypto_kem/sntrup653/clean/crypto_stream_aes256ctr.h new file mode 100644 index 00000000..cdd53214 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_stream_aes256ctr.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_STREAM_AES256CTR_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_STREAM_AES256CTR_H +#include "aes.h" +#include +#include + + + +int PQCLEAN_SNTRUP653_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]); + +#endif diff --git a/crypto_kem/sntrup653/clean/crypto_verify_897.c b/crypto_kem/sntrup653/clean/crypto_verify_897.c new file mode 100644 index 00000000..66b2756e --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_verify_897.c @@ -0,0 +1,13 @@ +#include "crypto_verify_897.h" + + +int PQCLEAN_SNTRUP653_CLEAN_crypto_verify_897(const unsigned char *x, const unsigned char *y) { + unsigned int differentbits = 0; + int i; + + for (i = 0; i < PQCLEAN_SNTRUP653_CLEAN_crypto_verify_897_BYTES; ++i) { + differentbits |= x[i] ^ y[i]; + } + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/crypto_kem/sntrup653/clean/crypto_verify_897.h b/crypto_kem/sntrup653/clean/crypto_verify_897.h new file mode 100644 index 00000000..b51896b5 --- /dev/null +++ b/crypto_kem/sntrup653/clean/crypto_verify_897.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_SNTRUP653_CLEAN_CRYPTO_VERIFY_897_H +#define PQCLEAN_SNTRUP653_CLEAN_CRYPTO_VERIFY_897_H + +#include +#define PQCLEAN_SNTRUP653_CLEAN_crypto_verify_897_BYTES 897 + +int PQCLEAN_SNTRUP653_CLEAN_crypto_verify_897(const unsigned char *x, const unsigned char *y); +#endif diff --git a/crypto_kem/sntrup653/clean/kem.c b/crypto_kem/sntrup653/clean/kem.c new file mode 100644 index 00000000..6f269166 --- /dev/null +++ b/crypto_kem/sntrup653/clean/kem.c @@ -0,0 +1,247 @@ +#include "api.h" +#include "crypto_sort_uint32.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* ----- small polynomials */ + +/* R3_fromR(R_fromRq(r)) */ +static void R3_fromRq(small *out, const Fq *r) { + crypto_encode_pxfreeze3((unsigned char *) out, (unsigned char *) r); +} + +/* h = f*g in the ring R3 */ +static void R3_mult(small *h, const small *f, const small *g) { + crypto_core_mult3((unsigned char *) h, (const unsigned char *) f, (const unsigned char *) g); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* h = 3f in Rq */ +static void Rq_mult3(Fq *h, const Fq *f) { + crypto_encode_pxint16((unsigned char *) h, f); + crypto_core_scale3((unsigned char *) h, (const unsigned char *) h); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* out = 1/(3*in) in Rq */ +/* 
caller must have 2p+1 bytes free in out, not just 2p */ +static void Rq_recip3(Fq *out, const small *in) { + crypto_core_inv((unsigned char *) out, (const unsigned char *) in); + /* could check byte 2*p for failure; but, in context, inv always works */ + crypto_decode_pxint16(out, (unsigned char *) out); +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[ppadsort]; + int i; + + randombytes((unsigned char *) L, 4 * p); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < w; ++i) { + L[i] = L[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (L[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_SNTRUP653_CLEAN_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +static void Small_random(small *out) { + uint32 L[p]; + int i; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + out[i] = (((L[i] & 0x3fffffff) * 3) >> 30) - 1; + } +} + +/* ----- Streamlined NTRU Prime */ + +typedef small Inputs[p]; /* passed by reference */ +#define Ciphertexts_bytes Rounded_bytes +#define SecretKeys_bytes (2*Small_bytes) +#define PublicKeys_bytes Rq_bytes +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +/* also set r_enc[0]=3 */ +/* also set x[0]=2, and x[1:1+Hash_bytes] = Hash3(r_enc) */ +/* also overwrite x[1+Hash_bytes:1+2*Hash_bytes] */ +static void Hide(unsigned char *x, unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + Fq h[p]; + int i; + + Small_encode(r_enc + 1, r); + Rq_decode(h, pk); + Rq_mult_small(h, r); + Round_and_encode(c, h); + r_enc[0] = 3; + Hash(x + 1, r_enc, 1 + Small_bytes); + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Hash_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c + Ciphertexts_bytes, x, 1 + Hash_bytes * 2); +} + + +int PQCLEAN_SNTRUP653_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + small g[p]; + for (;;) { + Small_random(g); + { + small v[p + 1]; + crypto_core_inv3((unsigned char *) v, (const unsigned char *) g); + if (v[p] == 0) { + Small_encode(sk + Small_bytes, v); + break; + } + } + } + { + small f[p]; + Short_random(f); + Small_encode(sk, f); + { + Fq h[p + 1]; + Rq_recip3(h, f); /* always works */ + Rq_mult_small(h, g); + Rq_encode(pk, h); + } + } + { + int i; + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Small_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Small_bytes); + } + return 0; +} + +int PQCLEAN_SNTRUP653_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + unsigned char cache[Hash_bytes]; + int i; + { + unsigned char y[1 + PublicKeys_bytes]; /* XXX: can eliminate with incremental hashing */ + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + { + Inputs r; + Short_random(r); + { + unsigned char r_enc[Small_bytes 
+ 1]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(x, c, r_enc, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + } + return 0; +} + +int PQCLEAN_SNTRUP653_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Small_bytes; + int mask, i; + Inputs r; + { + Fq d[p]; + Rounded_decode(d, c); + { + small f[p]; + Small_decode(f, sk); + Rq_mult_small(d, f); + Rq_mult3(d, d); + } + { + small e[p]; + small v[p]; + R3_fromRq(e, d); + Small_decode(v, sk + Small_bytes); + R3_mult(r, e, v); + } + crypto_core_wforce((unsigned char *) r, (unsigned char *) r); + } + { + unsigned char r_enc[1 + Small_bytes]; + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + /* XXX: can use incremental hashing to reduce x size */ + + Hide(x, cnew, r_enc, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Small_bytes; ++i) { + r_enc[i + 1] ^= mask & (r_enc[i + 1] ^ rho[i]); + } + Hash(x + 1, r_enc, 1 + Small_bytes); /* XXX: can instead do cmov on cached hash of rho */ + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/sntrup653/clean/params.h b/crypto_kem/sntrup653/clean/params.h new file mode 100644 index 00000000..68452f4e --- /dev/null +++ b/crypto_kem/sntrup653/clean/params.h @@ -0,0 +1,68 @@ +#ifndef params_H +#define params_H +#include "crypto_core_inv3sntrup653.h" +#include "crypto_core_invsntrup653.h" +#include "crypto_core_mult3sntrup653.h" +#include "crypto_core_multsntrup653.h" +#include "crypto_core_scale3sntrup653.h" +#include "crypto_core_weightsntrup653.h" +#include "crypto_core_wforcesntrup653.h" +#include "crypto_decode_653x1541.h" +#include "crypto_decode_653x3.h" +#include "crypto_decode_653x4621.h" +#include "crypto_decode_653xint16.h" +#include "crypto_decode_653xint32.h" +#include "crypto_encode_653x1541.h" +#include "crypto_encode_653x1541round.h" +#include "crypto_encode_653x3.h" +#include "crypto_encode_653x4621.h" +#include "crypto_encode_653xfreeze3.h" +#include "crypto_encode_653xint16.h" +#include "crypto_encode_int16.h" +#include "crypto_verify_897.h" + + +#define p 653 +#define q27 29045 /* closest integer to 2^27/q */ +#define q18 57 /* closest integer to 2^18/q */ +#define q 4621 +#define w 288 + +#define ppadsort 653 + +#define crypto_verify_clen PQCLEAN_SNTRUP653_CLEAN_crypto_verify_897 + +#define Rq_bytes PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x4621_STRBYTES +#define Rq_encode PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x4621 +#define Rq_decode PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x4621 + +#define Rounded_bytes PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x1541_STRBYTES +#define Rounded_decode PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x1541 + +#define Round_and_encode PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x1541round + +#define Small_bytes PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x3_STRBYTES +#define Small_encode PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653x3 +#define Small_decode PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653x3 + +#define crypto_encode_pxfreeze3 PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xfreeze3 + +#define crypto_decode_pxint32 
PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint32 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP653_CLEAN_crypto_decode_653xint16 + +#define crypto_encode_pxint16 PQCLEAN_SNTRUP653_CLEAN_crypto_encode_653xint16 + +#define crypto_core_wforce PQCLEAN_SNTRUP653_CLEAN_crypto_core_wforcesntrup653 + +#define crypto_core_scale3 PQCLEAN_SNTRUP653_CLEAN_crypto_core_scale3sntrup653 + +#define crypto_core_inv PQCLEAN_SNTRUP653_CLEAN_crypto_core_invsntrup653 + +#define crypto_core_inv3 PQCLEAN_SNTRUP653_CLEAN_crypto_core_inv3sntrup653 + +#define crypto_core_mult PQCLEAN_SNTRUP653_CLEAN_crypto_core_multsntrup653 + +#define crypto_core_mult3 PQCLEAN_SNTRUP653_CLEAN_crypto_core_mult3sntrup653 + +#endif diff --git a/crypto_kem/sntrup761/META.yml b/crypto_kem/sntrup761/META.yml new file mode 100644 index 00000000..0cc0b1a8 --- /dev/null +++ b/crypto_kem/sntrup761/META.yml @@ -0,0 +1,26 @@ +name: sntrup761 +type: kem +claimed-nist-level: 3 +claimed-security: IND-CCA2 +length-public-key: 1158 +length-secret-key: 1763 +length-ciphertext: 1039 +length-shared-secret: 32 +nistkat-sha256: 2eba10673b9077530ba9c063d22f2534e415a6da42985c333c6baee133cc0ff1 +principal-submitters: + - Daniel J. Bernstein + - Chitchanok Chuengsatiansup + - Tanja Lange + - Christine van Vredendaal +implementations: + - name: clean + version: supercop-20200826 + - name: avx2 + version: supercop-20200826 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 diff --git a/crypto_kem/sntrup761/avx2/LICENSE b/crypto_kem/sntrup761/avx2/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/sntrup761/avx2/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/sntrup761/avx2/Makefile b/crypto_kem/sntrup761/avx2/Makefile new file mode 100644 index 00000000..3e1c7ac2 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/Makefile @@ -0,0 +1,22 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libsntrup761_avx2.a +HEADERS=api.h crypto_core_inv3sntrup761.h crypto_core_invsntrup761.h crypto_core_mult3sntrup761.h crypto_core_multsntrup761.h crypto_core_multsntrup761_ntt.h crypto_core_scale3sntrup761.h crypto_core_weightsntrup761.h crypto_core_wforcesntrup761.h crypto_decode_761x1531.h crypto_decode_761x3.h crypto_decode_761x4591.h crypto_decode_761xint16.h crypto_decode_761xint32.h crypto_decode_int16.h crypto_encode_761x1531.h crypto_encode_761x1531round.h crypto_encode_761x3.h crypto_encode_761x4591.h crypto_encode_761xfreeze3.h crypto_encode_761xint16.h crypto_encode_int16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_1039.h params.h +OBJECTS=crypto_core_inv3sntrup761.o crypto_core_invsntrup761.o crypto_core_mult3sntrup761.o crypto_core_multsntrup761.o crypto_core_multsntrup761_ntt.o crypto_core_scale3sntrup761.o crypto_core_weightsntrup761.o crypto_core_wforcesntrup761.o crypto_decode_761x1531.o crypto_decode_761x3.o crypto_decode_761x4591.o crypto_decode_761xint16.o crypto_decode_761xint32.o crypto_decode_int16.o crypto_encode_761x1531.o crypto_encode_761x1531round.o crypto_encode_761x3.o crypto_encode_761x4591.o crypto_encode_761xfreeze3.o crypto_encode_761xint16.o crypto_encode_int16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1039.o kem.o + +CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.s $(HEADERS) + $(AS) -o $@ $< + +%.o: %.c $(HEADERS) + $(CC) 
$(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/sntrup761/avx2/api.h b/crypto_kem/sntrup761/avx2/api.h new file mode 100644 index 00000000..afefdc16 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/api.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_API_H +#define PQCLEAN_SNTRUP761_AVX2_API_H + + + +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ALGNAME "sntrup761" + +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_SECRETKEYBYTES 1763 +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_PUBLICKEYBYTES 1158 +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CIPHERTEXTBYTES 1039 +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_BYTES 32 + +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_core_inv3sntrup761.c b/crypto_kem/sntrup761/avx2/crypto_core_inv3sntrup761.c new file mode 100644 index 00000000..71fdf508 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_inv3sntrup761.c @@ -0,0 +1,542 @@ +#include "crypto_core_inv3sntrup761.h" +#include + + +#define int8 int8_t +typedef int8 small; + +#define p 761 +#define ppad 768 +#define numvec 3 + +typedef __m256i vec256; + +/* +This code stores 768-coeff poly as vec256[3]. +Order of 256 coefficients in each vec256 +is optimized in light of costs of vector instructions: + 0,4,...,252 in 64-bit word; + 1,5,...,253 in 64-bit word; + 2,6,...,254 in 64-bit word; + 3,7,...,255 in 64-bit word. +*/ + +static inline void vec256_frombits(vec256 *v, const small *b) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 b0 = _mm256_loadu_si256((vec256 *) b); + b += 32; /* 0,1,...,31 */ + vec256 b1 = _mm256_loadu_si256((vec256 *) b); + b += 32; /* 32,33,... */ + vec256 b2 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b3 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b4 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b5 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b6 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b7 = _mm256_loadu_si256((vec256 *) b); + b += 32; + + vec256 c0 = _mm256_unpacklo_epi32(b0, b1); /* 0 1 2 3 32 33 34 35 4 5 6 7 36 37 38 39 ... 55 */ + vec256 c1 = _mm256_unpackhi_epi32(b0, b1); /* 8 9 10 11 40 41 42 43 ... 
63 */ + vec256 c2 = _mm256_unpacklo_epi32(b2, b3); + vec256 c3 = _mm256_unpackhi_epi32(b2, b3); + vec256 c4 = _mm256_unpacklo_epi32(b4, b5); + vec256 c5 = _mm256_unpackhi_epi32(b4, b5); + vec256 c6 = _mm256_unpacklo_epi32(b6, b7); + vec256 c7 = _mm256_unpackhi_epi32(b6, b7); + + vec256 d0 = c0 | _mm256_slli_epi32(c1, 2); /* 0 8, 1 9, 2 10, 3 11, 32 40, 33 41, ..., 55 63 */ + vec256 d2 = c2 | _mm256_slli_epi32(c3, 2); + vec256 d4 = c4 | _mm256_slli_epi32(c5, 2); + vec256 d6 = c6 | _mm256_slli_epi32(c7, 2); + + vec256 e0 = _mm256_unpacklo_epi64(d0, d2); + vec256 e2 = _mm256_unpackhi_epi64(d0, d2); + vec256 e4 = _mm256_unpacklo_epi64(d4, d6); + vec256 e6 = _mm256_unpackhi_epi64(d4, d6); + + vec256 f0 = e0 | _mm256_slli_epi32(e2, 1); + vec256 f4 = e4 | _mm256_slli_epi32(e6, 1); + + vec256 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + vec256 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + + vec256 h = g0 | _mm256_slli_epi32(g4, 4); + +#define TRANSPOSE _mm256_set_epi8( 31,27,23,19, 30,26,22,18, 29,25,21,17, 28,24,20,16, 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0 ) + h = _mm256_shuffle_epi8(h, TRANSPOSE); + h = _mm256_permute4x64_epi64(h, 0xd8); + h = _mm256_shuffle_epi32(h, 0xd8); + + *v++ = h; + } +} + +static inline void vec256_tobits(const vec256 *v, small *b) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 h = *v++; + + h = _mm256_shuffle_epi32(h, 0xd8); + h = _mm256_permute4x64_epi64(h, 0xd8); + h = _mm256_shuffle_epi8(h, TRANSPOSE); + + vec256 g0 = h & _mm256_set1_epi8(15); + vec256 g4 = _mm256_srli_epi32(h, 4) & _mm256_set1_epi8(15); + + vec256 f0 = _mm256_permute2x128_si256(g0, g4, 0x20); + vec256 f4 = _mm256_permute2x128_si256(g0, g4, 0x31); + + vec256 e0 = f0 & _mm256_set1_epi8(5); + vec256 e2 = _mm256_srli_epi32(f0, 1) & _mm256_set1_epi8(5); + vec256 e4 = f4 & _mm256_set1_epi8(5); + vec256 e6 = _mm256_srli_epi32(f4, 1) & _mm256_set1_epi8(5); + + vec256 d0 = _mm256_unpacklo_epi32(e0, e2); + vec256 d2 = _mm256_unpackhi_epi32(e0, e2); + vec256 d4 = _mm256_unpacklo_epi32(e4, e6); + vec256 d6 = _mm256_unpackhi_epi32(e4, e6); + + vec256 c0 = d0 & _mm256_set1_epi8(1); + vec256 c1 = _mm256_srli_epi32(d0, 2) & _mm256_set1_epi8(1); + vec256 c2 = d2 & _mm256_set1_epi8(1); + vec256 c3 = _mm256_srli_epi32(d2, 2) & _mm256_set1_epi8(1); + vec256 c4 = d4 & _mm256_set1_epi8(1); + vec256 c5 = _mm256_srli_epi32(d4, 2) & _mm256_set1_epi8(1); + vec256 c6 = d6 & _mm256_set1_epi8(1); + vec256 c7 = _mm256_srli_epi32(d6, 2) & _mm256_set1_epi8(1); + + vec256 b0 = _mm256_unpacklo_epi64(c0, c1); + vec256 b1 = _mm256_unpackhi_epi64(c0, c1); + vec256 b2 = _mm256_unpacklo_epi64(c2, c3); + vec256 b3 = _mm256_unpackhi_epi64(c2, c3); + vec256 b4 = _mm256_unpacklo_epi64(c4, c5); + vec256 b5 = _mm256_unpackhi_epi64(c4, c5); + vec256 b6 = _mm256_unpacklo_epi64(c6, c7); + vec256 b7 = _mm256_unpackhi_epi64(c6, c7); + + _mm256_storeu_si256((vec256 *) b, b0); + b += 32; + _mm256_storeu_si256((vec256 *) b, b1); + b += 32; + _mm256_storeu_si256((vec256 *) b, b2); + b += 32; + _mm256_storeu_si256((vec256 *) b, b3); + b += 32; + _mm256_storeu_si256((vec256 *) b, b4); + b += 32; + _mm256_storeu_si256((vec256 *) b, b5); + b += 32; + _mm256_storeu_si256((vec256 *) b, b6); + b += 32; + _mm256_storeu_si256((vec256 *) b, b7); + b += 32; + } +} + +static void vec256_init(vec256 *G0, vec256 *G1, const small *s) { + int i; + small srev[ppad + (ppad - p)]; + small si; + small g0[ppad]; + small g1[ppad]; + + for (i = 0; i < p; ++i) { + srev[ppad - 1 - i] = s[i]; + } + for (i = 0; i < ppad - p; ++i) { + srev[i] = 0; + } + for (i = 
p; i < ppad; ++i) { + srev[i + ppad - p] = 0; + } + + for (i = 0; i < ppad; ++i) { + si = srev[i + ppad - p]; + g0[i] = si & 1; + g1[i] = (si >> 1) & g0[i]; + } + + vec256_frombits(G0, g0); + vec256_frombits(G1, g1); +} + +static void vec256_final(small *out, const vec256 *V0, const vec256 *V1) { + int i; + small v0[ppad]; + small v1[ppad]; + small v[ppad]; + small vrev[ppad + (ppad - p)]; + + vec256_tobits(V0, v0); + vec256_tobits(V1, v1); + + for (i = 0; i < ppad; ++i) { + v[i] = v0[i] + 2 * v1[i] - 4 * (v0[i] & v1[i]); + } + + for (i = 0; i < ppad; ++i) { + vrev[i] = v[ppad - 1 - i]; + } + for (i = ppad; i < ppad + (ppad - p); ++i) { + vrev[i] = 0; + } + + for (i = 0; i < p; ++i) { + out[i] = vrev[i + ppad - p]; + } +} + +static inline int negative_mask(int x) { + return x >> 31; +} + +static inline void vec256_swap(vec256 *f, vec256 *g, int len, vec256 mask) { + vec256 flip; + int i; + + for (i = 0; i < len; ++i) { + flip = mask & (f[i] ^ g[i]); + f[i] ^= flip; + g[i] ^= flip; + } +} + +static inline void vec256_scale(vec256 *f0, vec256 *f1, const vec256 c0, const vec256 c1) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 f0i = f0[i]; + vec256 f1i = f1[i]; + + f0i &= c0; + f1i ^= c1; + f1i &= f0i; + + f0[i] = f0i; + f1[i] = f1i; + } +} + +static inline void vec256_eliminate(vec256 *f0, vec256 *f1, vec256 *g0, vec256 *g1, int len, const vec256 c0, const vec256 c1) { + int i; + + for (i = 0; i < len; ++i) { + vec256 f0i = f0[i]; + vec256 f1i = f1[i]; + vec256 g0i = g0[i]; + vec256 g1i = g1[i]; + vec256 t; + + f0i &= c0; + f1i ^= c1; + f1i &= f0i; + + t = g0i ^ f0i; + g0[i] = t | (g1i ^ f1i); + g1[i] = (g1i ^ f0i) & (f1i ^ t); + } +} + +static inline int vec256_bit0mask(vec256 *f) { + return -(_mm_cvtsi128_si32(_mm256_castsi256_si128(f[0])) & 1); +} + +static inline void vec256_divx_1(vec256 *f) { + vec256 f0 = f[0]; + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + + low0 = low0 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); +} + +static inline void vec256_divx_2(vec256 *f) { + vec256 f0 = f[0]; + vec256 f1 = f[1]; + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + + low0 = (low0 >> 1) | (low1 << 63); + low1 = low1 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); + f[1] = _mm256_permute4x64_epi64(f1, 0x39); +} + +static inline void vec256_divx_3(vec256 *f) { + vec256 f0 = f[0]; + vec256 f1 = f[1]; + vec256 f2 = f[2]; + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2)); + + low0 = (low0 >> 1) | (low1 << 63); + low1 = (low1 >> 1) | (low2 << 63); + low2 = low2 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); + f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); + f[1] = _mm256_permute4x64_epi64(f1, 0x39); + f[2] = _mm256_permute4x64_epi64(f2, 0x39); +} + +static inline void vec256_timesx_1(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + + unsigned long long low0 = 
_mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + + low0 = low0 << 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + + f[0] = f0; +} + +static inline void vec256_timesx_2(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + + low1 = (low1 << 1) | (low0 >> 63); + low0 = low0 << 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); + + f[0] = f0; + f[1] = f1; +} + +static inline void vec256_timesx_3(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); + vec256 f2 = _mm256_permute4x64_epi64(f[2], 0x93); + + unsigned long long low0 = *(unsigned long long *) &f0; + unsigned long long low1 = *(unsigned long long *) &f1; + unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2)); + + low2 = (low2 << 1) | (low1 >> 63); + low1 = (low1 << 1) | (low0 >> 63); + low0 = low0 << 1; + + *(unsigned long long *) &f0 = low0; + *(unsigned long long *) &f1 = low1; + f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3); + + f[0] = f0; + f[1] = f1; + f[2] = f2; +} + + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + small *out = (void *) outbytes; + small *in = (void *) inbytes; + vec256 F0[numvec]; + vec256 F1[numvec]; + vec256 G0[numvec]; + vec256 G1[numvec]; + vec256 V0[numvec]; + vec256 V1[numvec]; + vec256 R0[numvec]; + vec256 R1[numvec]; + vec256 c0vec, c1vec; + int loop; + int c0, c1; + int minusdelta = -1; + int swapmask; + vec256 swapvec; + + vec256_init(G0, G1, in); + F0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1); + F0[1] = _mm256_set1_epi32(0); + F0[2] = _mm256_set_epi32(0, 0, 0, 0, 1073741824, 0, 1073741824, 0); + F1[0] = _mm256_set1_epi32(0); + F1[1] = _mm256_set1_epi32(0); + F1[2] = _mm256_set_epi32(0, 0, 0, 0, 1073741824, 0, 1073741824, 0); + + V0[0] = _mm256_set1_epi32(0); + V1[0] = _mm256_set1_epi32(0); + V0[1] = _mm256_set1_epi32(0); + V1[1] = _mm256_set1_epi32(0); + V0[2] = _mm256_set1_epi32(0); + V1[2] = _mm256_set1_epi32(0); + + R0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1); + R1[0] = _mm256_set1_epi32(0); + R0[1] = _mm256_set1_epi32(0); + R1[1] = _mm256_set1_epi32(0); + R0[2] = _mm256_set1_epi32(0); + R1[2] = _mm256_set1_epi32(0); + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_1(V0); + vec256_timesx_1(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 3, swapvec); + vec256_swap(F1, G1, 3, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); + vec256_divx_3(G0); + vec256_divx_3(G1); + + vec256_swap(V0, R0, 1, swapvec); + vec256_swap(V1, R1, 1, swapvec); + vec256_eliminate(V0, V1, R0, R1, 1, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_2(V0); + vec256_timesx_2(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ 
vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 3, swapvec); + vec256_swap(F1, G1, 3, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); + vec256_divx_3(G0); + vec256_divx_3(G1); + + vec256_swap(V0, R0, 2, swapvec); + vec256_swap(V1, R1, 2, swapvec); + vec256_eliminate(V0, V1, R0, R1, 2, c0vec, c1vec); + } + + for (loop = 497; loop > 0; --loop) { + vec256_timesx_3(V0); + vec256_timesx_3(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 3, swapvec); + vec256_swap(F1, G1, 3, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); + vec256_divx_3(G0); + vec256_divx_3(G1); + + vec256_swap(V0, R0, 3, swapvec); + vec256_swap(V1, R1, 3, swapvec); + vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_3(V0); + vec256_timesx_3(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 2, swapvec); + vec256_swap(F1, G1, 2, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 2, c0vec, c1vec); + vec256_divx_2(G0); + vec256_divx_2(G1); + + vec256_swap(V0, R0, 3, swapvec); + vec256_swap(V1, R1, 3, swapvec); + vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_3(V0); + vec256_timesx_3(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 1, swapvec); + vec256_swap(F1, G1, 1, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 1, c0vec, c1vec); + vec256_divx_1(G0); + vec256_divx_1(G1); + + vec256_swap(V0, R0, 3, swapvec); + vec256_swap(V1, R1, 3, swapvec); + vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); + } + + c0vec = _mm256_set1_epi32(vec256_bit0mask(F0)); + c1vec = _mm256_set1_epi32(vec256_bit0mask(F1)); + vec256_scale(V0, V1, c0vec, c1vec); + + vec256_final(out, V0, V1); + out[p] = negative_mask(minusdelta); + return 0; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_core_inv3sntrup761.h b/crypto_kem/sntrup761/avx2/crypto_core_inv3sntrup761.h new file mode 100644 index 00000000..3ad25475 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_inv3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_INV3SNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_INV3SNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761_OUTPUTBYTES 762 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761_INPUTBYTES 761 +#define 
PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_core_invsntrup761.c b/crypto_kem/sntrup761/avx2/crypto_core_invsntrup761.c new file mode 100644 index 00000000..6fdd9e0d --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_invsntrup761.c @@ -0,0 +1,202 @@ +#include "crypto_core_invsntrup761.h" +#include "params.h" +#include + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + + + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + return x >> 15; /* XXX: theoretically need gcc -fwrapv for this */ +} + +/* ----- arithmetic mod q */ + +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +/* nonnegative e */ +static Fq Fq_pow(Fq a, int e) { + if (e == 0) { + return 1; + } + if (e == 1) { + return a; + } + if (e & 1) { + return Fq_freeze(a * (int32)Fq_pow(a, e - 1)); + } + a = Fq_freeze(a * (int32)a); + return Fq_pow(a, e >> 1); +} + +static Fq Fq_recip(Fq a) { + return Fq_pow(a, q - 2); +} + +/* ----- more */ + +#define qvec _mm256_set1_epi16(q) +#define qinvvec _mm256_set1_epi16(qinv) + +static inline __m256i montproduct(__m256i x, __m256i y, __m256i yqinv) { + __m256i hi, d, e; + + d = _mm256_mullo_epi16(x, yqinv); + hi = _mm256_mulhi_epi16(x, y); + e = _mm256_mulhi_epi16(d, qvec); + return _mm256_sub_epi16(hi, e); +} + +static inline void vectormodq_swapeliminate(Fq *f, Fq *g, int len, const Fq f0, const Fq g0, int mask) { + __m256i f0vec = _mm256_set1_epi16(f0); + __m256i g0vec = _mm256_set1_epi16(g0); + __m256i f0vecqinv = _mm256_mullo_epi16(f0vec, qinvvec); + __m256i g0vecqinv = _mm256_mullo_epi16(g0vec, qinvvec); + __m256i maskvec = _mm256_set1_epi32(mask); + + while (len > 0) { + __m256i fi = _mm256_loadu_si256((__m256i *) f); + __m256i gi = _mm256_loadu_si256((__m256i *) g); + __m256i finew = _mm256_blendv_epi8(fi, gi, maskvec); + __m256i ginew = _mm256_blendv_epi8(gi, fi, maskvec); + ginew = _mm256_sub_epi16(montproduct(ginew, f0vec, f0vecqinv), montproduct(finew, g0vec, g0vecqinv)); + _mm256_storeu_si256((__m256i *) f, finew); + _mm256_storeu_si256((__m256i *) (g - 1), ginew); + f += 16; + g += 16; + len -= 16; + } +} + +static inline void vectormodq_xswapeliminate(Fq *f, Fq *g, int len, const Fq f0, const Fq g0, int mask) { + __m256i f0vec = _mm256_set1_epi16(f0); + __m256i g0vec = _mm256_set1_epi16(g0); + __m256i f0vecqinv = _mm256_mullo_epi16(f0vec, qinvvec); + __m256i g0vecqinv = _mm256_mullo_epi16(g0vec, qinvvec); + __m256i maskvec = _mm256_set1_epi32(mask); + + f += len + (-len & 15); + g += len + (-len & 15); + while (len > 0) { + f -= 16; + g -= 16; + len -= 16; + __m256i fi = _mm256_loadu_si256((__m256i *) f); + __m256i gi = _mm256_loadu_si256((__m256i *) g); + __m256i finew = _mm256_blendv_epi8(fi, gi, maskvec); + 
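+        /* mask is either 0 or -1, so the two blends keep (fi, gi) as is or swap them wholesale. */
+        /* The Montgomery products below combine the post-swap values as f0*g - g0*f;            */
+        /* storing finew at f + 1 while ginew goes back to g shifts f up by one slot,            */
+        /* i.e. it effectively multiplies that polynomial by x.                                  */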
__m256i ginew = _mm256_blendv_epi8(gi, fi, maskvec); + ginew = _mm256_sub_epi16(montproduct(ginew, f0vec, f0vecqinv), montproduct(finew, g0vec, g0vecqinv)); + _mm256_storeu_si256((__m256i *) (f + 1), finew); + _mm256_storeu_si256((__m256i *) g, ginew); + } +} + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + small *in = (void *) inbytes; + int loop; + Fq out[p], f[ppad], g[ppad], v[ppad], r[ppad]; + Fq f0, g0; + Fq scale; + int i; + int delta = 1; + int minusdelta; + int fgflip; + int swap; + + for (i = 0; i < ppad; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = -1; + f[p] = -1; + /* generalization: initialize f to reversal of any deg-p polynomial m */ + + for (i = 0; i < p; ++i) { + g[i] = in[p - 1 - i]; + } + for (i = p; i < ppad; ++i) { + g[i] = 0; + } + + for (i = 0; i < ppad; ++i) { + r[i] = 0; + } + r[0] = Fq_recip(3); + + for (i = 0; i < ppad; ++i) { + v[i] = 0; + } + + for (loop = 0; loop < p; ++loop) { + g0 = Fq_freeze(g[0]); + f0 = f[0]; + + minusdelta = -delta; + swap = int16_negative_mask(minusdelta) & int16_nonzero_mask(g0); + delta ^= swap & (delta ^ minusdelta); + delta += 1; + + fgflip = swap & (f0 ^ g0); + f0 ^= fgflip; + g0 ^= fgflip; + + f[0] = f0; + + vectormodq_swapeliminate(f + 1, g + 1, p, f0, g0, swap); + vectormodq_xswapeliminate(v, r, loop + 1, f0, g0, swap); + } + + for (loop = p - 1; loop > 0; --loop) { + g0 = Fq_freeze(g[0]); + f0 = f[0]; + + minusdelta = -delta; + swap = int16_negative_mask(minusdelta) & int16_nonzero_mask(g0); + delta ^= swap & (delta ^ minusdelta); + delta += 1; + + fgflip = swap & (f0 ^ g0); + f0 ^= fgflip; + g0 ^= fgflip; + + f[0] = f0; + + vectormodq_swapeliminate(f + 1, g + 1, loop, f0, g0, swap); + vectormodq_xswapeliminate(v, r, p, f0, g0, swap); + } + + scale = Fq_recip(Fq_freeze(f[0])); + for (i = 0; i < p; ++i) { + out[i] = Fq_freeze(scale * (int32)Fq_freeze(v[p - i])); + } + + crypto_encode_pxint16(outbytes, out); + outbytes[2 * p] = int16_nonzero_mask(delta); + return 0; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_core_invsntrup761.h b/crypto_kem/sntrup761/avx2/crypto_core_invsntrup761.h new file mode 100644 index 00000000..1a2adf16 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_invsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_INVSNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_INVSNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761_OUTPUTBYTES 1523 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_core_mult3sntrup761.c b/crypto_kem/sntrup761/avx2/crypto_core_mult3sntrup761.c new file mode 100644 index 00000000..ad5b13ec --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_mult3sntrup761.c @@ -0,0 +1,259 @@ +#include "crypto_core_mult3sntrup761.h" +#include "crypto_core_multsntrup761_ntt.h" +#include "crypto_decode_761xint16.h" +#include "crypto_encode_761xint16.h" +#include + +typedef int8_t int8; +typedef int16_t int16; + +#define int16x16 __m256i +#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p)) +#define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) +#define const_x16 _mm256_set1_epi16 +#define add_x16 _mm256_add_epi16 +#define sub_x16 
_mm256_sub_epi16 +#define mullo_x16 _mm256_mullo_epi16 +#define mulhi_x16 _mm256_mulhi_epi16 +#define mulhrs_x16 _mm256_mulhrs_epi16 +#define signmask_x16(x) _mm256_srai_epi16((x),15) + +typedef union { + int16 v[3][512]; + int16x16 _dummy; +} vec3x512; + +typedef union { + int16 v[768]; + int16x16 _dummy; +} vec768; + +typedef union { + int16 v[3 * 512]; + int16x16 _dummy; +} vec1536; + +static int16x16 squeeze_3_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(10923)), const_x16(3))); +} + +static int16x16 squeeze_7681_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); +} + +static int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(7681)); + return sub_x16(b, e); +} + +#define mask0 _mm256_set_epi16(-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1) +#define mask1 _mm256_set_epi16(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0) +#define mask2 _mm256_set_epi16(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0) + +static void good(int16 fpad[3][512], const int16 f[768]) { + int j; + int16x16 f0, f1; + + j = 0; + for (;;) { + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[1][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[2][j], (f0 & mask2) | (f1 & mask0)); + j += 16; + if (j == 256) { + break; + } + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[1][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[2][j], (f0 & mask1) | (f1 & mask2)); + j += 16; + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[1][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[2][j], (f0 & mask0) | (f1 & mask1)); + j += 16; + } + for (;;) { + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask2); + store_x16(&fpad[1][j], f0 & mask0); + store_x16(&fpad[2][j], f0 & mask1); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask1); + store_x16(&fpad[1][j], f0 & mask2); + store_x16(&fpad[2][j], f0 & mask0); + j += 16; + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask0); + store_x16(&fpad[1][j], f0 & mask1); + store_x16(&fpad[2][j], f0 & mask2); + j += 16; + } +} + +static void ungood(int16 f[1536], const int16 fpad[3][512]) { + int j; + int16x16 f0, f1, f2, g0, g1, g2; + + j = 0; + + for (;;) { + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g1 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask2)|(f1&mask0)|(f2&mask1) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g1 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask1)|(f1&mask2)|(f2&mask0) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g1 = (f0 & mask2) | 
(f1 & mask0) | (f2 & mask1); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask0)|(f1&mask1)|(f2&mask2) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + } +} + +static void mult768(int16 h[1536], const int16 f[768], const int16 g[768]) { + vec3x512 x1, x2; + vec1536 x3; +#define fpad (x1.v) +#define gpad (x2.v) +#define hpad fpad +#define h_7681 (x3.v) + int i; + + good(fpad, f); + PQCLEAN_SNTRUP761_AVX2_ntt512_7681(fpad[0], 3); + + good(gpad, g); + PQCLEAN_SNTRUP761_AVX2_ntt512_7681(gpad[0], 3); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_7681_x16(f0, g0); + int16x16 d1 = mulmod_7681_x16(f1, g1); + int16x16 d2 = mulmod_7681_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_7681_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_7681_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_7681_x16(h0)); + store_x16(&hpad[1][i], squeeze_7681_x16(h1)); + store_x16(&hpad[2][i], squeeze_7681_x16(h2)); + } + + PQCLEAN_SNTRUP761_AVX2_invntt512_7681(hpad[0], 3); + ungood(h_7681, (const int16(*)[512]) hpad); + + for (i = 0; i < 1536; i += 16) { + int16x16 u = load_x16(&h_7681[i]); + u = mulmod_7681_x16(u, const_x16(956)); + store_x16(&h[i], u); + } +} + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16 + +#define p 761 + +static inline int16x16 freeze_3_x16(int16x16 x) { + int16x16 mask, x3; + x = add_x16(x, const_x16(3)&signmask_x16(x)); + mask = signmask_x16(sub_x16(x, const_x16(2))); + x3 = sub_x16(x, const_x16(3)); + x = _mm256_blendv_epi8(x3, x, mask); + return x; +} + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + vec768 x1, x2; + vec1536 x3; +#define f (x1.v) +#define g (x2.v) +#define fg (x3.v) +#define h f + int i; + int16x16 x; + + x = const_x16(0); + for (i = p & ~15; i < 768; i += 16) { + store_x16(&f[i], x); + } + for (i = p & ~15; i < 768; i += 16) { + store_x16(&g[i], x); + } + + for (i = 0; i < p; ++i) { + int8 fi = inbytes[i]; + int8 fi0 = fi & 1; + f[i] = fi0 - (fi & (fi0 << 1)); + } + for (i = 0; i < p; ++i) { + int8 gi = kbytes[i]; + int8 gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + mult768(fg, f, g); + + fg[0] -= fg[p - 1]; + for (i = 0; i < 768; i += 16) { + int16x16 fgi = load_x16(&fg[i]); + int16x16 fgip = load_x16(&fg[i + p]); + int16x16 fgip1 = load_x16(&fg[i + p - 1]); + x = add_x16(fgi, add_x16(fgip, fgip1)); + x = freeze_3_x16(squeeze_3_x16(x)); + store_x16(&h[i], x); + } + + for (i = 0; i < p; ++i) { + outbytes[i] = h[i]; + } + + return 0; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_core_mult3sntrup761.h b/crypto_kem/sntrup761/avx2/crypto_core_mult3sntrup761.h new file mode 100644 index 00000000..051fd590 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_mult3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_MULT3SNTRUP761_H +#define 
PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_MULT3SNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761_OUTPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761_KEYBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761.c b/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761.c new file mode 100644 index 00000000..58b83ed2 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761.c @@ -0,0 +1,314 @@ +#include "crypto_core_multsntrup761.h" +#include "crypto_core_multsntrup761_ntt.h" +#include "crypto_decode_761xint16.h" +#include "crypto_encode_761xint16.h" +#include + +typedef int8_t int8; +typedef int16_t int16; + +#define int16x16 __m256i +#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p)) +#define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) +#define const_x16 _mm256_set1_epi16 +#define add_x16 _mm256_add_epi16 +#define sub_x16 _mm256_sub_epi16 +#define mullo_x16 _mm256_mullo_epi16 +#define mulhi_x16 _mm256_mulhi_epi16 +#define mulhrs_x16 _mm256_mulhrs_epi16 +#define signmask_x16(x) _mm256_srai_epi16((x),15) + +typedef union { + int16 v[3][512]; + int16x16 _dummy; +} vec3x512; + +typedef union { + int16 v[768]; + int16x16 _dummy; +} vec768; + +typedef union { + int16 v[3 * 512]; + int16x16 _dummy; +} vec1536; + +static inline int16x16 squeeze_4591_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(7)), const_x16(4591))); +} + +static inline int16x16 squeeze_7681_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); +} + +static inline int16x16 squeeze_10753_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(3)), const_x16(10753))); +} + +static inline int16x16 mulmod_4591_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(15631)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(4591)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(7681)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_10753_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-10751)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(10753)); + return sub_x16(b, e); +} + +#define mask0 _mm256_set_epi16(-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1) +#define mask1 _mm256_set_epi16(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0) +#define mask2 _mm256_set_epi16(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0) + +static void good(int16 fpad[3][512], const int16 f[768]) { + int j; + int16x16 f0, f1; + + j = 0; + for (;;) { + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[1][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[2][j], (f0 & mask2) | (f1 & mask0)); + j += 16; + if (j == 256) { + break; + } + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & 
mask2) | (f1 & mask0)); + store_x16(&fpad[1][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[2][j], (f0 & mask1) | (f1 & mask2)); + j += 16; + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[1][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[2][j], (f0 & mask0) | (f1 & mask1)); + j += 16; + } + for (;;) { + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask2); + store_x16(&fpad[1][j], f0 & mask0); + store_x16(&fpad[2][j], f0 & mask1); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask1); + store_x16(&fpad[1][j], f0 & mask2); + store_x16(&fpad[2][j], f0 & mask0); + j += 16; + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask0); + store_x16(&fpad[1][j], f0 & mask1); + store_x16(&fpad[2][j], f0 & mask2); + j += 16; + } +} + +static void ungood(int16 f[1536], const int16 fpad[3][512]) { + int j; + int16x16 f0, f1, f2, g0, g1, g2; + + j = 0; + + for (;;) { + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g1 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask2)|(f1&mask0)|(f2&mask1) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g1 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask1)|(f1&mask2)|(f2&mask0) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g1 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask0)|(f1&mask1)|(f2&mask2) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + } +} + +static void mult768(int16 h[1536], const int16 f[768], const int16 g[768]) { + vec3x512 x1, x2; + vec1536 x3, x4; +#define fpad (x1.v) +#define gpad (x2.v) +#define hpad fpad +#define h_7681 (x3.v) +#define h_10753 (x4.v) + int i; + + good(fpad, f); + PQCLEAN_SNTRUP761_AVX2_ntt512_7681(fpad[0], 3); + + good(gpad, g); + PQCLEAN_SNTRUP761_AVX2_ntt512_7681(gpad[0], 3); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_7681_x16(f0, g0); + int16x16 d1 = mulmod_7681_x16(f1, g1); + int16x16 d2 = mulmod_7681_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_7681_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_7681_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_7681_x16(h0)); + store_x16(&hpad[1][i], squeeze_7681_x16(h1)); + store_x16(&hpad[2][i], squeeze_7681_x16(h2)); + } + + PQCLEAN_SNTRUP761_AVX2_invntt512_7681(hpad[0], 3); + 
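+    /* hpad now holds the convolution result modulo 7681, still in the permuted 3x512 */
+    /* layout produced by good(); ungood() below flattens it into h_7681[1536]. The   */
+    /* same transform/multiply/inverse-transform pass is then repeated modulo 10753,  */
+    /* and the two residues are recombined (CRT-style) into the result modulo 4591.   */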
ungood(h_7681, (const int16(*)[512]) hpad); + + good(fpad, f); + PQCLEAN_SNTRUP761_AVX2_ntt512_10753(fpad[0], 3); + + good(gpad, g); + PQCLEAN_SNTRUP761_AVX2_ntt512_10753(gpad[0], 3); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_10753_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_10753_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_10753_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_10753_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_10753_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_10753_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_10753_x16(f0, g0); + int16x16 d1 = mulmod_10753_x16(f1, g1); + int16x16 d2 = mulmod_10753_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_10753_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_10753_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_10753_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_10753_x16(h0)); + store_x16(&hpad[1][i], squeeze_10753_x16(h1)); + store_x16(&hpad[2][i], squeeze_10753_x16(h2)); + } + + PQCLEAN_SNTRUP761_AVX2_invntt512_10753(hpad[0], 3); + ungood(h_10753, (const int16(*)[512]) hpad); + + for (i = 0; i < 1536; i += 16) { + int16x16 u1 = load_x16(&h_10753[i]); + int16x16 u2 = load_x16(&h_7681[i]); + int16x16 t; + u1 = mulmod_10753_x16(u1, const_x16(1268)); + u2 = mulmod_7681_x16(u2, const_x16(956)); + t = mulmod_7681_x16(sub_x16(u2, u1), const_x16(-2539)); + t = add_x16(u1, mulmod_4591_x16(t, const_x16(-710))); + store_x16(&h[i], t); + } +} + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16 + +#define p 761 +#define q 4591 + +static inline int16x16 freeze_4591_x16(int16x16 x) { + int16x16 mask, xq; + x = add_x16(x, const_x16(q)&signmask_x16(x)); + mask = signmask_x16(sub_x16(x, const_x16((q + 1) / 2))); + xq = sub_x16(x, const_x16(q)); + x = _mm256_blendv_epi8(xq, x, mask); + return x; +} + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + vec768 x1, x2; + vec1536 x3; +#define f (x1.v) +#define g (x2.v) +#define fg (x3.v) +#define h f + int i; + int16x16 x; + + x = const_x16(0); + for (i = p & ~15; i < 768; i += 16) { + store_x16(&f[i], x); + } + for (i = p & ~15; i < 768; i += 16) { + store_x16(&g[i], x); + } + + crypto_decode_pxint16(f, inbytes); + + for (i = 0; i < 768; i += 16) { + x = load_x16(&f[i]); + x = freeze_4591_x16(squeeze_4591_x16(x)); + store_x16(&f[i], x); + } + for (i = 0; i < p; ++i) { + int8 gi = kbytes[i]; + int8 gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + mult768(fg, f, g); + + fg[0] -= fg[p - 1]; + for (i = 0; i < 768; i += 16) { + int16x16 fgi = load_x16(&fg[i]); + int16x16 fgip = load_x16(&fg[i + p]); + int16x16 fgip1 = load_x16(&fg[i + p - 1]); + x = add_x16(fgi, add_x16(fgip, fgip1)); + x = freeze_4591_x16(squeeze_4591_x16(x)); + store_x16(&h[i], x); + } + + crypto_encode_pxint16(outbytes, h); + + return 0; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761.h b/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761.h new file mode 100644 index 00000000..846aea2e --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_MULTSNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_MULTSNTRUP761_H + +#include +#define 
PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761_OUTPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761_INPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761_KEYBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761_ntt.c b/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761_ntt.c new file mode 100644 index 00000000..67464046 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761_ntt.c @@ -0,0 +1,927 @@ +#include "crypto_core_multsntrup761.h" +#include "crypto_core_multsntrup761_ntt.h" +#include +#include + +/* auto-generated; do not edit */ + + +typedef int8_t int8; +typedef int16_t int16; + +#define zeta(n,i) (((__m256i *) zeta_##n)[(i)]) +#define zeta_x4(n,i) (((__m256i *) zeta_x4_##n)[(i)]) +#define zeta_qinv(n,i) (((__m256i *) qinvzeta_##n)[(i)]) +#define zeta_x4_qinv(n,i) (((__m256i *) qinvzeta_x4_##n)[(i)]) +#define zetainv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_x4_##n+2*(n)+4-16*((i)+1))) +#define zetainv_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_x4_##n+2*(n)+4-16*((i)+1))) + +typedef union { + int16 data[93 * 16]; + __m256i _dummy; +} vec1488; + +static const vec1488 qdata_7681 = { .data = { + +#define q_x16 (qdata[0]) + 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, + +#define qrecip_x16 (qdata[1]) + 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, + +#define qshift_x16 (qdata[2]) + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + +#define zeta4_x16 (qdata[3]) + -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, + +#define zeta4_x16_qinv (qdata[4]) + -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, + +#define zeta8_x16 (qdata[5]) + -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, + +#define zeta8_x16_qinv (qdata[6]) + -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, + +#define zetainv8_x16 (qdata[7]) + -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, + +#define zetainv8_x16_qinv (qdata[8]) + -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, + +#define zeta_x4_16 (qdata+9) + -3593, -3593, -3593, -3593, -2194, -2194, -2194, -2194, -3625, -3625, -3625, -3625, 1100, 1100, 1100, 1100, + -3777, -3777, -3777, -3777, -2456, -2456, -2456, -2456, 3182, 3182, 3182, 3182, 3696, 3696, 3696, 3696, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_16 (qdata+12) + -9, -9, -9, -9, 4974, 4974, 4974, 4974, -16425, -16425, -16425, -16425, 7244, 7244, 7244, 7244, + -28865, -28865, -28865, -28865, -14744, -14744, -14744, -14744, 10350, 10350, 10350, 10350, -4496, -4496, -4496, -4496, + 9, 
9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_x4_32 (qdata+15) + -3593, -3593, -3593, -3593, 1414, 1414, 1414, 1414, -2194, -2194, -2194, -2194, -2495, -2495, -2495, -2495, + -3625, -3625, -3625, -3625, 2876, 2876, 2876, 2876, 1100, 1100, 1100, 1100, -2250, -2250, -2250, -2250, + -3777, -3777, -3777, -3777, -1701, -1701, -1701, -1701, -2456, -2456, -2456, -2456, 834, 834, 834, 834, + 3182, 3182, 3182, 3182, -2319, -2319, -2319, -2319, 3696, 3696, 3696, 3696, 121, 121, 121, 121, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_32 (qdata+20) + -9, -9, -9, -9, 20870, 20870, 20870, 20870, 4974, 4974, 4974, 4974, 22593, 22593, 22593, 22593, + -16425, -16425, -16425, -16425, 828, 828, 828, 828, 7244, 7244, 7244, 7244, -23754, -23754, -23754, -23754, + -28865, -28865, -28865, -28865, 20315, 20315, 20315, 20315, -14744, -14744, -14744, -14744, 18242, 18242, 18242, 18242, + 10350, 10350, 10350, 10350, -18191, -18191, -18191, -18191, -4496, -4496, -4496, -4496, -11655, -11655, -11655, -11655, + 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_64 (qdata+25) + -3593, -617, 1414, 3706, -2194, -1296, -2495, -2237, -3625, 2830, 2876, -1599, 1100, 1525, -2250, 2816, + -3777, 1921, -1701, 2006, -2456, 1483, 834, -1986, 3182, 3364, -2319, -1993, 3696, -2557, 121, 2088, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_64 (qdata+28) + -9, 19351, 20870, -15750, 4974, -9488, 22593, 7491, -16425, 26382, 828, 23489, 7244, 20469, -23754, 2816, + -28865, -5759, 20315, -3114, -14744, 15307, 18242, -19394, 10350, -10972, -18191, -31177, -4496, -25597, -11655, 22568, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_128 (qdata+31) + -3593, -2804, -617, -396, 1414, -549, 3706, 810, -2194, -1321, -1296, 438, -2495, -2535, -2237, -3689, + -3625, 2043, 2830, -1881, 2876, 3153, -1599, 7, 1100, -514, 1525, -1760, -2250, -2440, 2816, 3600, + -3777, 103, 1921, -3174, -1701, 1535, 2006, -1887, -2456, 1399, 1483, -679, 834, 3772, -1986, 1738, + 3182, -1431, 3364, -3555, -2319, -2310, -1993, 638, 3696, -2956, -2557, -1305, 121, 2555, 2088, -3266, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_128 (qdata+36) + -9, -29428, 19351, 26228, 20870, 21467, -15750, 5930, 4974, -14121, -9488, -21066, 22593, 2073, 7491, 16279, + -16425, -25093, 26382, 26279, 828, -29103, 23489, 11783, 7244, 14846, 20469, 14624, -23754, -6536, 2816, 11792, + -28865, -4505, -5759, -6246, 20315, 9215, -3114, 6817, -14744, 4983, 15307, -28839, 18242, 1724, -19394, 23242, + 10350, -21399, -10972, -29667, -18191, -21766, -31177, 15998, -4496, 23668, -25597, -5913, -11655, -24581, 22568, -20674, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_256 (qdata+41) + -3593, 2665, -2804, -2572, -617, 727, -396, 3417, 1414, 2579, -549, 373, 3706, 3750, 810, -1054, + -2194, -2133, -1321, 1681, -1296, -1386, 438, -2732, -2495, 1919, -2535, -2391, -2237, 2835, -3689, 2, + -3625, -783, 2043, 3145, 2830, 1533, -1881, 2789, 2876, 2649, 3153, 3692, -1599, -1390, 7, -1166, + 1100, 3310, -514, 2224, 1525, -2743, -1760, 2385, -2250, -486, -2440, -1756, 2816, -3816, 3600, -3831, + -3777, -1799, 103, 1497, 1921, 1521, -3174, -194, -1701, -859, 1535, 2175, 2006, -2762, -1887, -1698, + -2456, -3480, 1399, 2883, 1483, -3428, -679, -2113, 834, 1532, 3772, -660, -1986, -2764, 1738, -915, + 3182, 1056, -1431, 1350, 3364, 1464, -3555, 2919, -2319, -2160, -2310, 730, -1993, -1598, 638, 3456, + 3696, -1168, -2956, -3588, -2557, -921, -1305, 3405, 
121, -404, 2555, -3135, 2088, 2233, -3266, -2426, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_256 (qdata+50) + -9, -17303, -29428, 24052, 19351, -12073, 26228, -24743, 20870, -12269, 21467, 19317, -15750, -25946, 5930, 32738, + 4974, -4693, -14121, 2193, -9488, 26262, -21066, 7508, 22593, 9599, 2073, 10409, 7491, -12013, 16279, -15358, + -16425, -16655, -25093, 32329, 26382, 24573, 26279, 13541, 828, -25511, -29103, 26220, 23489, -8558, 11783, -24718, + 7244, 10478, 14846, 26800, 20469, 26441, 14624, -29871, -23754, -3558, -6536, -16092, 2816, 8472, 11792, -7415, + -28865, -13575, -4505, -26663, -5759, -14351, -6246, -17602, 20315, -22875, 9215, 9855, -3114, -24266, 6817, -2722, + -14744, -15768, 4983, 12611, 15307, -21860, -28839, -27201, 18242, 32252, 1724, 21868, -19394, -8908, 23242, 13933, + 10350, 17440, -21399, -11962, -10972, 30136, -29667, -1689, -18191, 6032, -21766, 30426, -31177, 15810, 15998, 3456, + -4496, -9360, 23668, 27132, -25597, -5529, -5913, 1869, -11655, 22124, -24581, 21953, 22568, 23225, -20674, 17030, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_512 (qdata+59) + -3593, 2005, 2665, 2891, -2804, 2345, -2572, 1121, -617, -188, 727, 2786, -396, -3208, 3417, -17, + 1414, -3752, 2579, 2815, -549, 1837, 373, 151, 3706, -1012, 3750, -1509, 810, -3214, -1054, 3177, + -2194, -1403, -2133, -3314, -1321, 83, 1681, -658, -1296, 2070, -1386, -3547, 438, 3781, -2732, 2230, + -2495, -1669, 1919, 2589, -2535, -3312, -2391, -3542, -2237, -1441, 2835, -3568, -3689, -402, 2, -1070, + -3625, 3763, -783, -3550, 2043, -2303, 3145, -436, 2830, -893, 1533, 1712, -1881, 124, 2789, -2001, + 2876, -2460, 2649, 3770, 3153, 2965, 3692, -1203, -1599, 2874, -1390, -1407, 7, -3745, -1166, 1649, + 1100, 2937, 3310, 3461, -514, -1526, 2224, 715, 1525, -1689, -2743, 434, -1760, -3163, 2385, -929, + -2250, -2167, -486, -1144, -2440, -370, -1756, 2378, 2816, -1084, -3816, -1586, 3600, 1931, -3831, -1242, + -3777, 592, -1799, 2340, 103, -1338, 1497, -2071, 1921, 1519, 1521, 451, -3174, 589, -194, -3744, + -1701, 3677, -859, -1295, 1535, 642, 2175, -3794, 2006, 2130, -2762, 2918, -1887, 3334, -1698, 2072, + -2456, 509, -3480, 2998, 1399, -3408, 2883, 1476, 1483, -2262, -3428, -1779, -679, 2258, -2113, 1348, + 834, -692, 1532, 2247, 3772, 2083, -660, -226, -1986, 2532, -2764, -3693, 1738, -429, -915, -2059, + 3182, 2812, 1056, 3434, -1431, -2515, 1350, -236, 3364, -2386, 1464, 222, -3555, -2963, 2919, -2422, + -2319, -3657, -2160, 3450, -2310, -791, 730, 1181, -1993, -1404, -1598, 2339, 638, -3366, 3456, 2161, + 3696, -3343, -1168, 2719, -2956, -826, -3588, -670, -2557, 777, -921, 1151, -1305, -796, 3405, -1278, + 121, -3287, -404, 1072, 2555, 293, -3135, 2767, 2088, -3335, 2233, 3581, -3266, 3723, -2426, -179, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_512 (qdata+76) + -9, 4565, -17303, 16715, -29428, 15145, 24052, -22943, 19351, 1860, -12073, -28958, 26228, -7304, -24743, -529, + 20870, -24232, -12269, 10495, 21467, -16083, 19317, 20119, -15750, -27636, -25946, -12261, 5930, -26766, 32738, -16791, + 4974, 25733, -4693, 20238, -14121, 18003, 2193, 6510, -9488, 29718, 26262, -25563, -21066, -1851, 7508, -19274, + 22593, -28805, 9599, -23523, 2073, 4880, 10409, 1578, 7491, -10145, -12013, 4624, 16279, 6766, -15358, 24530, + -16425, 5299, -16655, -2526, -25093, -9983, 32329, 5708, 26382, -23933, 24573, 26288, 26279, 30844, 13541, 30255, + 828, 15972, -25511, 17082, -29103, -27243, 26220, -2739, 23489, 16186, -8558, -9087, 11783, 
-12449, -24718, -14223, + 7244, -8839, 10478, 30597, 14846, -12790, 26800, 14539, 20469, -6297, 26441, 9650, 14624, -25179, -29871, -9633, + -23754, -5751, -3558, 2952, -6536, 23182, -16092, 23882, 2816, 964, 8472, -10802, 11792, -17013, -7415, -30938, + -28865, -23984, -13575, -11996, -4505, -14650, -26663, -22039, -5759, 1007, -14351, 10179, -6246, -947, -17602, -20128, + 20315, 10333, -22875, -17167, 9215, -14718, 9855, -29394, -3114, 27730, -24266, 5990, 6817, 22790, -2722, 14360, + -14744, 23549, -15768, -18506, 4983, 21168, 12611, 3524, 15307, 2858, -21860, 29453, -28839, 27858, -27201, 3396, + 18242, 5452, 32252, -18745, 1724, -4573, 21868, 31518, -19394, 20964, -8908, -18541, 23242, 17491, 13933, 16885, + 10350, -32004, 17440, -24214, -21399, -20435, -11962, -22764, -10972, -27986, 30136, -802, -29667, 11885, -1689, -13686, + -18191, 32695, 6032, -16006, -21766, -20759, 30426, -24931, -31177, -32124, 15810, -4317, 15998, 26330, 3456, -13711, + -4496, -19215, -9360, 26783, 23668, -14138, 27132, -32414, -25597, -2807, -5529, 8831, -5913, 17636, 1869, -16638, + -11655, 9513, 22124, 25648, -24581, -21723, 21953, -14129, 22568, -15111, 23225, 26621, -20674, -15221, 17030, -1715, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static const vec1488 qdata_10753 = { .data = { + + 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, + + 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, + + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + + 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, + + 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, + + 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, + + -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, + + 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, + + -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, + + 1018, 1018, 1018, 1018, 2413, 2413, 2413, 2413, 4188, 4188, 4188, 4188, 357, 357, 357, 357, + 223, 223, 223, 223, -3686, -3686, -3686, -3686, -3688, -3688, -3688, -3688, -376, -376, -376, -376, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 10093, 10093, 10093, 10093, -1956, -1956, -1956, -1956, 28517, 28517, 28517, 28517, + 27359, 27359, 27359, 27359, -21094, -21094, -21094, -21094, 408, 408, 408, 408, -20856, -20856, -20856, -20856, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 1018, 1018, 1018, -2695, -2695, -2695, -2695, 2413, 2413, 2413, 2413, 425, 425, 425, 425, + 4188, 4188, 4188, 4188, -4855, -4855, -4855, -4855, 357, 357, 357, 357, -3364, -3364, -3364, -3364, + 223, 223, 223, 223, 730, 730, 730, 730, -3686, -3686, -3686, -3686, -4544, -4544, -4544, -4544, + -3688, -3688, -3688, -3688, -2236, -2236, -2236, -2236, -376, -376, -376, -376, 3784, 3784, 3784, 3784, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 7033, 7033, 7033, 7033, 10093, 10093, 10093, 10093, 18345, 18345, 18345, 18345, + -1956, -1956, -1956, -1956, 29449, 29449, 29449, 29449, 28517, 28517, 28517, 28517, -9508, -9508, -9508, -9508, + 27359, 27359, 27359, 27359, 16090, 16090, 16090, 16090, -21094, -21094, -21094, -21094, 28224, 28224, 28224, 28224, + 408, 
408, 408, 408, -12476, -12476, -12476, -12476, -20856, -20856, -20856, -20856, 16072, 16072, 16072, 16072, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -1520, -2695, 1341, 2413, 918, 425, 5175, 4188, -4035, -4855, 341, 357, 4347, -3364, 5213, + 223, -4875, 730, 1931, -3686, -2503, -4544, -4095, -3688, 5063, -2236, -3823, -376, 3012, 3784, -2629, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 23056, 7033, 829, 10093, 26518, 18345, 3639, -1956, -4547, 29449, 3925, 28517, -7429, -9508, -11683, + 27359, -17675, 16090, 14731, -21094, -25543, 28224, -14847, 408, 28103, -12476, 10001, -20856, -7228, 16072, 18363, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -2935, -1520, -4744, -2695, -205, 1341, 1299, 2413, 4, 918, -4379, 425, -4616, 5175, -544, + 4188, 4129, -4035, 4102, -4855, -1287, 341, -2388, 357, 1284, 4347, 2984, -3364, 2178, 5213, -2576, + 223, 2790, -4875, 4876, 730, -4513, 1931, -3085, -3686, 3550, -2503, 847, -4544, 193, -4095, 1085, + -3688, 3091, 5063, -4742, -2236, 2982, -3823, -1009, -376, -268, 3012, 3062, 3784, -2565, -2629, 4189, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 31369, 23056, 15736, 7033, -24269, 829, -6381, 10093, 22532, 26518, 23781, 18345, 15864, 3639, 15840, + -1956, -23007, -4547, 5126, 29449, 8441, 3925, -16724, 28517, 23812, -7429, 31656, -9508, -19326, -11683, -27152, + 27359, 20198, -17675, 6924, 16090, 22623, 14731, 5619, -21094, -24098, -25543, 3407, 28224, 22209, -14847, 573, + 408, -4589, 28103, -5766, -12476, -12378, 10001, -31217, -20856, -2316, -7228, -20490, 16072, -14341, 18363, -12707, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -4734, -2935, -400, -1520, 4977, -4744, -2973, -2695, 512, -205, -779, 1341, -1356, 1299, 635, + 2413, 567, 4, -4286, 918, -5114, -4379, -1586, 425, 1615, -4616, -336, 5175, -1841, -544, 2234, + 4188, -3441, 4129, 636, -4035, -4580, 4102, 2684, -4855, 3057, -1287, -2740, 341, -5156, -2388, -472, + 357, -794, 1284, 578, 4347, 3615, 2984, -3715, -3364, 2271, 2178, -326, 5213, 454, -2576, -3337, + 223, 2998, 2790, -151, -4875, 2981, 4876, 1324, 730, 2774, -4513, 2206, 1931, 886, -3085, -970, + -3686, 3198, 3550, 2737, -2503, -909, 847, 1068, -4544, -2213, 193, 2884, -4095, -4808, 1085, 4123, + -3688, 5341, 3091, 5294, 5063, -116, -4742, -5116, -2236, -2045, 2982, -1572, -3823, 4828, -1009, 467, + -376, 5023, -268, -3169, 3012, -1458, 3062, -1268, 3784, -675, -2565, 1006, -2629, 5064, 4189, 864, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -26238, 31369, -24976, 23056, -30351, 15736, -18845, 7033, 512, -24269, -13579, 829, 29364, -6381, -11141, + 10093, -969, 22532, 6978, 26518, -4090, 23781, 11726, 18345, 4175, 15864, 7856, 3639, 719, 15840, -31558, + -1956, 31887, -23007, -21892, -4547, 22044, 5126, -19844, 29449, -32271, 8441, 32076, 3925, -11300, -16724, 28200, + 28517, 16614, 23812, 11842, -7429, -2017, 31656, 28541, -9508, 29407, -19326, 31418, -11683, -31290, -27152, 27895, + 27359, 12214, 20198, -14999, -17675, -1627, 6924, -13012, 16090, -4394, 22623, 7326, 14731, -22666, 5619, 8246, + -21094, 24702, -24098, 177, -25543, 7795, 3407, -13268, 28224, 2395, 22209, -7356, -14847, -17096, 573, -24037, + 408, -11555, -4589, -30546, 28103, 1932, -5766, 17412, -12476, 31235, -12378, -7716, 10001, -1316, -31217, 25555, + -20856, -609, -2316, -8801, -7228, 11854, -20490, 780, 16072, -17571, -14341, -2066, 18363, 17352, -12707, 17248, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 3453, -4734, 4519, -2935, 2118, -400, -554, 
-1520, 2196, 4977, 1893, -4744, -1409, -2973, -1053, + -2695, 4601, 512, 279, -205, -3241, -779, 4889, 1341, 3524, -1356, -1663, 1299, 2283, 635, 73, + 2413, 2428, 567, 624, 4, -1930, -4286, 3419, 918, -2062, -5114, 5068, -4379, -97, -1586, 1782, + 425, 4621, 1615, 355, -4616, 1349, -336, 825, 5175, 3135, -1841, 1160, -544, 4408, 2234, -2605, + 4188, 854, -3441, -1056, 4129, 2439, 636, 4967, -4035, -4782, -4580, -5268, 4102, -663, 2684, -4670, + -4855, 3760, 3057, 3535, -1287, 2680, -2740, -569, 341, 2139, -5156, 3827, -2388, 1639, -472, 1927, + 357, 5172, -794, -4003, 1284, 4144, 578, 693, 4347, 4784, 3615, 3125, 2984, 1122, -3715, 2113, + -3364, -573, 2271, -4328, 2178, 2909, -326, -4000, 5213, -4447, 454, -3995, -2576, -4428, -3337, 2529, + 223, 5309, 2998, 5120, 2790, -2050, -151, 2963, -4875, 2657, 2981, -2807, 4876, 2237, 1324, -4403, + 730, 2624, 2774, -5083, -4513, 40, 2206, 152, 1931, -1573, 886, 2625, -3085, -778, -970, -5107, + -3686, 4250, 3198, -5356, 3550, -3148, 2737, -3360, -2503, -2015, -909, 3096, 847, 5313, 1068, 834, + -4544, -1132, -2213, -2151, 193, -1722, 2884, -4393, -4095, 2662, -4808, -2788, 1085, -1992, 4123, 5334, + -3688, 5215, 5341, -1689, 3091, -2117, 5294, 4859, 5063, 3410, -116, 2205, -4742, -2374, -5116, -4720, + -2236, 3570, -2045, 2813, 2982, 2087, -1572, -4973, -3823, 458, 4828, 3891, -1009, -2419, 467, -4891, + -376, -1381, 5023, 1204, -268, 274, -3169, -3260, 3012, -1635, -1458, 4540, 3062, -4254, -1268, -1111, + 3784, 2230, -675, -2279, -2565, -4359, 1006, -1510, -2629, 5015, 5064, -2449, 4189, -5005, 864, 2487, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -29827, -26238, -21593, 31369, -29626, -24976, -7722, 23056, -16236, -30351, 30053, 15736, 9343, -18845, -16925, + 7033, 14329, 512, 15127, -24269, -21161, -13579, -1767, 829, -6716, 29364, -12415, -6381, 31467, -11141, 1609, + 10093, -20100, -969, -23952, 22532, -25482, 6978, 8027, 26518, 17394, -4090, -25652, 23781, -5729, 11726, -21770, + 18345, -4083, 4175, -15517, 15864, -19643, 7856, -22215, 3639, -18881, 719, -19320, 15840, -7880, -31558, 22483, + -1956, -6314, 31887, 15328, -23007, -7289, -21892, 11623, -4547, 31058, 22044, 13164, 5126, -15511, -19844, 6594, + 29449, 11952, -32271, 6095, 8441, 23160, 32076, 22471, 3925, 6747, -11300, 12531, -16724, 8295, 28200, -7801, + 28517, -29644, 16614, -20899, 23812, 12336, 11842, 20661, -7429, 12976, -2017, 23093, 31656, -3998, 28541, 24129, + -9508, -61, 29407, -232, -19326, -13987, 31418, 12384, -11683, -31583, -31290, 24165, -27152, 26292, 27895, 8161, + 27359, 4797, 12214, 5120, 20198, 19454, -14999, -4717, -17675, 8289, -1627, 31497, 6924, 1725, -13012, 19661, + 16090, -30144, -4394, -9691, 22623, 28712, 7326, 4248, 14731, 3035, -22666, 24641, 5619, -24330, 8246, -13811, + -21094, -13158, 24702, -23788, -24098, 27572, 177, 13024, -25543, -29151, 7795, 7192, 3407, 27329, -13268, 12098, + 28224, -19564, 2395, -8807, 22209, 32070, -7356, -22313, -14847, 20070, -17096, 23836, 573, -14280, -24037, -1834, + 408, 32351, -11555, 4967, -4589, 18875, -30546, -6917, 28103, -26286, 1932, 18077, -5766, 29370, 17412, 19856, + -12476, 23026, 31235, -30467, -12378, -24025, -7716, -12653, 10001, -8758, -1316, -20173, -31217, -11123, 25555, 23269, + -20856, -29541, -609, 31924, -2316, 3346, -8801, -13500, -7228, 14237, 11854, 14780, -20490, -9374, 780, 16809, + 16072, 11446, -17571, -8935, -14341, 5369, -2066, -18918, 18363, 19863, 17352, -16273, -12707, 3699, 17248, 951, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static inline 
__m256i sub_x16(__m256i a, __m256i b) { + //__asm__("vpsubw %1,%0,%0" : "+x"(a),"+x"(b)); + return _mm256_sub_epi16(a, b); +} + +static inline __m256i add_x16(__m256i a, __m256i b) { + return _mm256_add_epi16(a, b); +} + +static inline __m256i reduce_x16(const __m256i *qdata, __m256i x) { + __m256i y = _mm256_mulhi_epi16(x, qrecip_x16); + y = _mm256_mulhrs_epi16(y, qshift_x16); + y = _mm256_mullo_epi16(y, q_x16); + return sub_x16(x, y); +} + +static inline __m256i mulmod_x16_scaled(const __m256i *qdata, __m256i x, __m256i y, __m256i yqinv) { + __m256i b = _mm256_mulhi_epi16(x, y); + __m256i d = _mm256_mullo_epi16(x, yqinv); + __m256i e = _mm256_mulhi_epi16(d, q_x16); + return sub_x16(b, e); +} + +typedef union { + int8 data[32]; + __m256i _dummy; +} byte32; +static const byte32 shuffle_buf = { .data = { + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + } +}; +#define shuffle (*(__m256i *) shuffle_buf.data) + +static inline __m256i _mm256_loadu_reverse16(const __m256i *p) { + __m256i x = _mm256_loadu_si256(p); + x = _mm256_permute2x128_si256(x, x, 1); + x = _mm256_shuffle_epi8(x, shuffle); + return x; +} + +static void ntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + for (rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 = _mm256_loadu_si256((__m256i *) (f + 96)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_0, zetainv_qinv_128_0); + f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 0), zeta_qinv(128, 0)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 0), zeta_qinv(64, 0)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = _mm256_loadu_si256((__m256i *) (f + 80)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, 
g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_1, zetainv_qinv_128_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 1), zeta_qinv(128, 1)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 1), zeta_qinv(64, 1)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 0), zeta_x4_qinv(16, 0)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 32)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 1), zeta_x4_qinv(16, 1)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); 
+ _mm256_storeu_si256((__m256i *) (f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + + f1 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 32)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f2 = add_x16(g2, g3); + f3 = sub_x16(g2, g3); + f2 = reduce_x16(qdata, f2); + f3 = reduce_x16(qdata, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f0 = reduce_x16(qdata, f0); + + h0 = f0; + h1 = f1; + h2 = f2; + h3 = f3; + + f1 = _mm256_loadu_si256((__m256i *) (f + 80)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv8_x16, zetainv8_x16_qinv); + f2 = mulmod_x16_scaled(qdata, f2, zeta8_x16, zeta8_x16_qinv); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta4_x16, zeta4_x16_qinv); + f0 = reduce_x16(qdata, f0); + + g0 = add_x16(h0, f0); + g1 = add_x16(h1, f1); + g2 = add_x16(h2, f2); + g3 = add_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 0), g0); + _mm256_storeu_si256((__m256i *) (f + 16), g1); + _mm256_storeu_si256((__m256i *) (f + 32), g2); + _mm256_storeu_si256((__m256i *) (f + 48), g3); + g0 = sub_x16(h0, f0); + g1 = sub_x16(h1, f1); + g2 = sub_x16(h2, f2); + g3 = sub_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 64), g0); + _mm256_storeu_si256((__m256i *) (f + 80), g1); + _mm256_storeu_si256((__m256i *) (f + 96), g2); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + f += 128; + } +} + +static void ntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + int16 *origf = f; + int rep; + __m256i zetainv_512[8]; + __m256i zetainv_qinv_512[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_512[i], zetainv_qinv_512[i]); + f2 = mulmod_x16_scaled(qdata, f2, zeta(512, i), zeta_qinv(512, i)); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(256, i), zeta_qinv(256, i)); + f0 = reduce_x16(qdata, f0); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i), f0); + + } + f += 512; + } + f = origf; + ntt128(f, reps * 4, qdata); +} + +void 
PQCLEAN_SNTRUP761_AVX2_ntt512_7681(int16 *f, int reps) { + ntt512(f, reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_SNTRUP761_AVX2_ntt512_10753(int16 *f, int reps) { + ntt512(f, reps, (const __m256i *) qdata_10753.data); +} + +static void invntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_x4_16_0 = zetainv_x4(16, 0); + __m256i zetainv_x4_qinv_16_0 = zetainv_x4_qinv(16, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_64_0 = zetainv(64, 0); + __m256i zetainv_qinv_64_0 = zetainv_qinv(64, 0); + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i zetainv_x4_16_1 = zetainv_x4(16, 1); + __m256i zetainv_x4_qinv_16_1 = zetainv_x4_qinv(16, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + __m256i zetainv_64_1 = zetainv(64, 1); + __m256i zetainv_qinv_64_1 = zetainv_qinv(64, 1); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g0 = _mm256_loadu_si256((__m256i *) (f + 32)); + g1 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + h1 = sub_x16(f0, f1); + h1 = reduce_x16(qdata, h1); + h0 = add_x16(f0, f1); + h3 = sub_x16(f2, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h2 = add_x16(f2, f3); + f1 = sub_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zetainv8_x16, zetainv8_x16_qinv); + f0 = add_x16(g0, g1); + f3 = sub_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zeta8_x16, zeta8_x16_qinv); + f2 = add_x16(g2, g3); + + g0 = add_x16(h0, h2); + g0 = reduce_x16(qdata, g0); + g2 = sub_x16(h0, h2); + g2 = reduce_x16(qdata, g2); + g1 = sub_x16(h1, h3); + g3 = add_x16(h1, h3); + h2 = sub_x16(f0, f2); + h2 = mulmod_x16_scaled(qdata, h2, zeta4_x16, zeta4_x16_qinv); + h0 = add_x16(f0, f2); + h3 = add_x16(f1, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h1 = sub_x16(f1, f3); + + f0 = add_x16(g0, h0); + g0 = sub_x16(g0, h0); + f1 = add_x16(g1, h1); + g1 = sub_x16(g1, h1); + f2 = sub_x16(g2, h2); + g2 = add_x16(g2, h2); + f3 = sub_x16(g3, h3); + g3 = add_x16(g3, h3); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), g0); + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 96), g1); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + _mm256_storeu_si256((__m256i *) (f + 48), g2); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = _mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 = _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = 
_mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_0, zetainv_x4_qinv_16_0); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 32)); + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = _mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 = _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_1, zetainv_x4_qinv_16_1); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); + _mm256_storeu_si256((__m256i *) (f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 = _mm256_loadu_si256((__m256i *) (f + 96)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = _mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_0, zetainv_qinv_128_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 0), zeta_qinv(128, 0)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_0, zetainv_qinv_64_0); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 32), f1); + 
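+ /* Note: the 16/48/80/112 half of the block, handled next, repeats this inverse
+    layer with the index-1 constants zetainv_128_1, zeta(128,1) and zetainv_64_1. */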
_mm256_storeu_si256((__m256i *) (f + 96), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = _mm256_loadu_si256((__m256i *) (f + 80)); + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = _mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_1, zetainv_qinv_128_1); + f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 1), zeta_qinv(128, 1)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_1, zetainv_qinv_64_1); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + + f += 128; + } +} + +static void invntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + /* [-Werror=unused-variable] */ /* int16 *origf = f; */ + int rep; + __m256i zetainv_512[8]; + __m256i zetainv_qinv_512[8]; + __m256i zetainv_256[8]; + __m256i zetainv_qinv_256[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_256[i] = zetainv(256, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_256[i] = zetainv_qinv(256, i); + } + invntt128(f, 4 * reps, qdata); + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_512[i], zetainv_qinv_512[i]); + f3 = mulmod_x16_scaled(qdata, f3, zeta(512, i), zeta_qinv(512, i)); + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_256[i], zetainv_qinv_256[i]); + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + } + f += 512; + } +} + +void PQCLEAN_SNTRUP761_AVX2_invntt512_7681(int16 *f, int reps) { + invntt512(f, 
reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_SNTRUP761_AVX2_invntt512_10753(int16 *f, int reps) { + invntt512(f, reps, (const __m256i *) qdata_10753.data); +} diff --git a/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761_ntt.h b/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761_ntt.h new file mode 100644 index 00000000..8005ff81 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_multsntrup761_ntt.h @@ -0,0 +1,13 @@ +#ifndef ntt_H +#define ntt_H + +#include + + + +extern void PQCLEAN_SNTRUP761_AVX2_ntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP761_AVX2_ntt512_10753(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP761_AVX2_invntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP761_AVX2_invntt512_10753(int16_t *f, int reps); + +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_core_scale3sntrup761.c b/crypto_kem/sntrup761/avx2/crypto_core_scale3sntrup761.c new file mode 100644 index 00000000..477fe041 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_scale3sntrup761.c @@ -0,0 +1,47 @@ +#include "crypto_core_scale3sntrup761.h" +#include "crypto_decode_761xint16.h" +#include "crypto_encode_761xint16.h" +#include + +#define p 761 +#define q 4591 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16 + +typedef int16_t Fq; + +/* out = 3*in in Rq */ +int PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + int i = p - 16; + + __m256i save = _mm256_loadu_si256((__m256i *) (inbytes + 2 * i)); + /* in case outbytes = inbytes */ + + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) inbytes); + __m256i xneg; + x = _mm256_mullo_epi16(x, _mm256_set1_epi16(3)); + x = _mm256_sub_epi16(x, _mm256_set1_epi16((q + 1) / 2)); + xneg = _mm256_srai_epi16(x, 15); + x = _mm256_add_epi16(x, _mm256_set1_epi16(q)&xneg); + xneg = _mm256_srai_epi16(x, 15); + x = _mm256_add_epi16(x, _mm256_set1_epi16(q)&xneg); + x = _mm256_sub_epi16(x, _mm256_set1_epi16((q - 1) / 2)); + _mm256_storeu_si256((__m256i *) outbytes, x); + + inbytes += 32; + outbytes += 32; + i -= 16; + } while (i >= 0); + if (i <= -16) { + break; + } + inbytes += 2 * i; + outbytes += 2 * i; + _mm256_storeu_si256((__m256i *) outbytes, save); + } + + return 0; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_core_scale3sntrup761.h b/crypto_kem/sntrup761/avx2/crypto_core_scale3sntrup761.h new file mode 100644 index 00000000..954872f7 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_scale3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_SCALE3SNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_SCALE3SNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761_OUTPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761_INPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_core_weightsntrup761.c b/crypto_kem/sntrup761/avx2/crypto_core_weightsntrup761.c new file mode 100644 index 00000000..3398fe2f --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_weightsntrup761.c @@ -0,0 +1,44 @@ +#include "crypto_core_weightsntrup761.h" +#include "crypto_encode_int16.h" +#include "params.h" 
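+/* Note: p = 761 = 23*32 + 25, so the masked load at the top of the function below
+   picks up the low bits of the 25 trailing bytes in[736..760]; the loop then adds
+   the low bit of each byte in the remaining 23 full 32-byte blocks. */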
+#include + +#define int8 int8_t +#define int16 int16_t + + +/* out = little-endian weight of bottom bits of in */ +int PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + int8 *in = (void *) inbytes; + int i; + __m256i sum, sumhi; + int16 weight; + + sum = _mm256_loadu_si256((__m256i *) (in + p - 32)); + sum &= _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0); + for (i = p - 32; i >= 0; i -= 32) { + __m256i bits = _mm256_loadu_si256((__m256i *) in); + bits &= _mm256_set1_epi8(1); + sum = _mm256_add_epi8(sum, bits); + in += 32; + } + + /* sum is 32xint8; want to add these int8 */ + sumhi = _mm256_srli_epi16(sum, 8); + sum &= _mm256_set1_epi16(0xff); + sum = _mm256_add_epi16(sum, sumhi); + + /* sum is 16xint16; want to add these int16 */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[1]+sum[2]+sum[3]+sum[8]+sum[9]+sum[10]+sum[11] */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[1]+sum[8]+sum[9] */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[8] */ + + weight = _mm256_extract_epi16(sum, 0); + weight += _mm256_extract_epi16(sum, 8); + + PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16(outbytes, &weight); + return 0; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_core_weightsntrup761.h b/crypto_kem/sntrup761/avx2/crypto_core_weightsntrup761.h new file mode 100644 index 00000000..a2e3cd44 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_weightsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_WEIGHTSNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_WEIGHTSNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761_OUTPUTBYTES 2 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_core_wforcesntrup761.c b/crypto_kem/sntrup761/avx2/crypto_core_wforcesntrup761.c new file mode 100644 index 00000000..0a89659a --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_wforcesntrup761.c @@ -0,0 +1,61 @@ +#include "crypto_core_wforcesntrup761.h" +#include "crypto_decode_int16.h" +#include "params.h" +#include + +#define int16 int16_t + + +/* out = in if bottom bits of in have weight w */ +/* otherwise out = (1,1,...,1,0,0,...,0) */ +int PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761(unsigned char *out, const unsigned char *in) { + int16 weight; + int16 mask; + __m256i maskvec; + int i; + + crypto_core_weight((unsigned char *) &weight, in); + PQCLEAN_SNTRUP761_AVX2_crypto_decode_int16(&weight, (unsigned char *) &weight); + + mask = (weight - w) | (w - weight); + mask >>= 15; + maskvec = _mm256_set1_epi16((short) ~mask); + + i = w - 32; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) in); + x ^= _mm256_set1_epi8(1); + x &= maskvec; + x ^= _mm256_set1_epi8(1); + _mm256_storeu_si256((__m256i *) out, x); + in += 32; + out += 32; + i -= 32; + } while (i >= 0); + if (i <= -32) { + break; + } + in += i; + out += i; + } + + i = p - w - 32; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) in); + x &= maskvec; + _mm256_storeu_si256((__m256i *) out, x); + in += 32; + out += 32; + i -= 32; + } while (i >= 0); + if (i <= -32) { + break; + } + in 
+= i; + out += i; + } + + return 0; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_core_wforcesntrup761.h b/crypto_kem/sntrup761/avx2/crypto_core_wforcesntrup761.h new file mode 100644 index 00000000..78876a54 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_core_wforcesntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_WFORCESNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_WFORCESNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761_OUTPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761(unsigned char *out, const unsigned char *in); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_761x1531.c b/crypto_kem/sntrup761/avx2/crypto_decode_761x1531.c new file mode 100644 index 00000000..83dae782 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_761x1531.c @@ -0,0 +1,436 @@ +#include "crypto_decode_761x1531.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define int32 int32_t + +static inline int16 mullo(int16 x, int16 y) { + return x * y; +} + +static inline int16 mulhi(int16 x, int16 y) { + return (x * (int32)y) >> 16; +} + +static inline __m256i add(__m256i x, __m256i y) { + return _mm256_add_epi16(x, y); +} + +static inline __m256i sub(__m256i x, __m256i y) { + return _mm256_sub_epi16(x, y); +} + +static inline __m256i shiftleftconst(__m256i x, int16 y) { + return _mm256_slli_epi16(x, y); +} + +static inline __m256i signedshiftrightconst(__m256i x, int16 y) { + return _mm256_srai_epi16(x, y); +} + +static inline __m256i addconst(__m256i x, int16 y) { + return add(x, _mm256_set1_epi16(y)); +} + +static inline __m256i subconst(__m256i x, int16 y) { + return sub(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulloconst(__m256i x, int16 y) { + return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulhiconst(__m256i x, int16 y) { + return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i ifgesubconst(__m256i x, int16 y) { + __m256i y16 = _mm256_set1_epi16(y); + __m256i top16 = _mm256_set1_epi16((int16)(y - 1)); + return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); +} + +static inline __m256i ifnegaddconst(__m256i x, int16 y) { + return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); +} + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531(void *v, const unsigned char *s) { + int16 *R0 = v; + int16 R1[381], R2[191], R3[96], R4[48], R5[24], R6[12], R7[6], R8[3], R9[2], R10[1]; + long long i; + int16 a0, a1, a2; + __m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; + + s += PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531_STRBYTES; + a1 = 0; + a1 += *--s; /* 0...255 */ + a1 = mulhi(a1, -84) - mulhi(mullo(a1, -4828), 3475); + a1 += *--s; /* -1738...1992 */ + a1 += (a1 >> 15) & 3475; /* 0...3474 */ + R10[0] = a1; + + /* R10 ------> R9: reconstruct mod 1*[593]+[1500] */ + + i = 0; + s -= 1; + a2 = a0 = R10[0]; + a0 = mulhi(a0, 60) - mulhi(mullo(a0, -28292), 593); /* -297...311 */ + a0 += s[1 * i + 0]; /* -297...566 */ + a0 += (a0 >> 15) & 593; /* 0...592 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -31055); + + /* invalid inputs might need reduction mod 1500 */ + a1 -= 1500; + a1 += (a1 >> 15) & 1500; + + R9[0] = a0; + R9[1] = a1; + s -= 0; + + /* R9 ------> R8: reconstruct mod 2*[6232]+[1500] */ + + 
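+ /* Note: every reconstruction block below follows the same pattern.  A carry a2 from
+    the previous level plus one or two fresh bytes of s encode two digits in some
+    radix m (here m = 6232).  With V = a2*2^16 + s[1]*2^8 + s[0], the block computes,
+    in constant time, roughly
+        a0 = V % m;   // approximate reduction via mulhi/mullo, then a sign fixup add
+        a1 = V / m;   // shift out the power of 2 in m, then multiply by the
+                      // inverse of the odd part of m modulo 2^16
+    and reduces a1 once more at the end because, as the existing comments say,
+    invalid input bytes might leave it out of range. */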
R8[2] = R9[1]; + s -= 2; + for (i = 0; i >= 0; --i) { + a2 = a0 = R9[i]; + a0 = mulhi(a0, 672) - mulhi(mullo(a0, -2692), 6232); /* -3116...3284 */ + a0 += s[2 * i + 1]; /* -3116...3539 */ + a0 = mulhi(a0, 672) - mulhi(mullo(a0, -2692), 6232); /* -3148...3152 */ + a0 += s[2 * i + 0]; /* -3148...3407 */ + a0 += (a0 >> 15) & 6232; /* 0...6231 */ + a1 = (a2 << 13) + (s[2 * i + 1] << 5) + ((s[2 * i] - a0) >> 3); + a1 = mullo(a1, 12451); + + /* invalid inputs might need reduction mod 6232 */ + a1 -= 6232; + a1 += (a1 >> 15) & 6232; + + R8[2 * i] = a0; + R8[2 * i + 1] = a1; + } + + /* R8 ------> R7: reconstruct mod 5*[1263]+[304] */ + + i = 0; + s -= 1; + a2 = a0 = R8[2]; + a0 = mulhi(a0, -476) - mulhi(mullo(a0, -13284), 1263); /* -751...631 */ + a0 += s[1 * i + 0]; /* -751...886 */ + a0 += (a0 >> 15) & 1263; /* 0...1262 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -22001); + + /* invalid inputs might need reduction mod 304 */ + a1 -= 304; + a1 += (a1 >> 15) & 304; + + R7[4] = a0; + R7[5] = a1; + s -= 2; + for (i = 1; i >= 0; --i) { + a2 = a0 = R8[i]; + a0 = mulhi(a0, -476) - mulhi(mullo(a0, -13284), 1263); /* -751...631 */ + a0 += s[1 * i + 0]; /* -751...886 */ + a0 += (a0 >> 15) & 1263; /* 0...1262 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -22001); + + /* invalid inputs might need reduction mod 1263 */ + a1 -= 1263; + a1 += (a1 >> 15) & 1263; + + R7[2 * i] = a0; + R7[2 * i + 1] = a1; + } + + /* R7 ------> R6: reconstruct mod 11*[9097]+[2188] */ + + i = 0; + s -= 2; + a0 = R7[5]; + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4549...5135 */ + a0 += s[2 * i + 1]; /* -4549...5390 */ + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4712...4741 */ + a0 += s[2 * i + 0]; /* -4712...4996 */ + a0 += (a0 >> 15) & 9097; /* 0...9096 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, 17081); + + /* invalid inputs might need reduction mod 2188 */ + a1 -= 2188; + a1 += (a1 >> 15) & 2188; + + R6[10] = a0; + R6[11] = a1; + s -= 10; + for (i = 4; i >= 0; --i) { + a0 = R7[i]; + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4549...5135 */ + a0 += s[2 * i + 1]; /* -4549...5390 */ + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4712...4741 */ + a0 += s[2 * i + 0]; /* -4712...4996 */ + a0 += (a0 >> 15) & 9097; /* 0...9096 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, 17081); + + /* invalid inputs might need reduction mod 9097 */ + a1 -= 9097; + a1 += (a1 >> 15) & 9097; + + R6[2 * i] = a0; + R6[2 * i + 1] = a1; + } + + /* R6 ------> R5: reconstruct mod 23*[1526]+[367] */ + + i = 0; + s -= 1; + a2 = a0 = R6[11]; + a0 = mulhi(a0, 372) - mulhi(mullo(a0, -10994), 1526); /* -763...856 */ + a0 += s[1 * i + 0]; /* -763...1111 */ + a0 += (a0 >> 15) & 1526; /* 0...1525 */ + a1 = (a2 << 7) + ((s[i] - a0) >> 1); + a1 = mullo(a1, -18381); + + /* invalid inputs might need reduction mod 367 */ + a1 -= 367; + a1 += (a1 >> 15) & 367; + + R5[22] = a0; + R5[23] = a1; + s -= 11; + for (i = 10; i >= 0; --i) { + a2 = a0 = R6[i]; + a0 = mulhi(a0, 372) - mulhi(mullo(a0, -10994), 1526); /* -763...856 */ + a0 += s[1 * i + 0]; /* -763...1111 */ + a0 += (a0 >> 15) & 1526; /* 0...1525 */ + a1 = (a2 << 7) + ((s[i] - a0) >> 1); + a1 = mullo(a1, -18381); + + /* invalid inputs might need reduction mod 1526 */ + a1 -= 1526; + a1 += (a1 >> 15) & 1526; + + R5[2 * i] = a0; + R5[2 * i + 1] = a1; + } + + /* R5 ------> R4: reconstruct mod 47*[625]+[150] */ + + i = 0; + s -= 1; + a2 = a0 = R5[23]; + a0 = mulhi(a0, -284) - mulhi(mullo(a0, -26844), 625); /* -384...312 
*/ + a0 += s[1 * i + 0]; /* -384...567 */ + a0 += (a0 >> 15) & 625; /* 0...624 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, 32401); + + /* invalid inputs might need reduction mod 150 */ + a1 -= 150; + a1 += (a1 >> 15) & 150; + + R4[46] = a0; + R4[47] = a1; + s -= 23; + i = 7; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R5[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -284), mulhiconst(mulloconst(A0, -26844), 625)); /* -384...312 */ + A0 = add(A0, S0); /* -384...567 */ + A0 = ifnegaddconst(A0, 625); /* 0...624 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, 32401); + + /* invalid inputs might need reduction mod 625 */ + A1 = ifgesubconst(A1, 625); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R4 ------> R3: reconstruct mod 95*[6400]+[1531] */ + + i = 0; + s -= 2; + a2 = a0 = R4[47]; + a0 = mulhi(a0, 2816) - mulhi(mullo(a0, -2621), 6400); /* -3200...3904 */ + a0 += s[2 * i + 1]; /* -3200...4159 */ + a0 = mulhi(a0, 2816) - mulhi(mullo(a0, -2621), 6400); /* -3338...3378 */ + a0 += s[2 * i + 0]; /* -3338...3633 */ + a0 += (a0 >> 15) & 6400; /* 0...6399 */ + a1 = (a2 << 8) + s[2 * i + 1] + ((s[2 * i] - a0) >> 8); + a1 = mullo(a1, 23593); + + /* invalid inputs might need reduction mod 1531 */ + a1 -= 1531; + a1 += (a1 >> 15) & 1531; + + R3[94] = a0; + R3[95] = a1; + s -= 94; + i = 31; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R4[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 2816), mulhiconst(mulloconst(A0, -2621), 6400)); /* -3200...3904 */ + A0 = add(A0, S1); /* -3200...4159 */ + A0 = sub(mulhiconst(A0, 2816), mulhiconst(mulloconst(A0, -2621), 6400)); /* -3338...3378 */ + A0 = add(A0, S0); /* -3338...3633 */ + A0 = ifnegaddconst(A0, 6400); /* 0...6399 */ + A1 = add(add(shiftleftconst(A2, 8), S1), signedshiftrightconst(sub(S0, A0), 8)); + A1 = mulloconst(A1, 23593); + + /* invalid inputs might need reduction mod 6400 */ + A1 = ifgesubconst(A1, 6400); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R3 ------> R2: reconstruct mod 190*[1280]+[1531] */ + + R2[190] = R3[95]; + s -= 95; + i = 79; + for (;;) { + A2 = A0 = 
_mm256_loadu_si256((__m256i *) &R3[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 256), mulhiconst(mulloconst(A0, -13107), 1280)); /* -640...704 */ + A0 = add(A0, S0); /* -640...959 */ + A0 = ifnegaddconst(A0, 1280); /* 0...1279 */ + A1 = add(A2, signedshiftrightconst(sub(S0, A0), 8)); + A1 = mulloconst(A1, -13107); + + /* invalid inputs might need reduction mod 1280 */ + A1 = ifgesubconst(A1, 1280); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R2 ------> R1: reconstruct mod 380*[9157]+[1531] */ + + R1[380] = R2[190]; + s -= 380; + i = 174; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R2[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 1592), mulhiconst(mulloconst(A0, -1832), 9157)); /* -4579...4976 */ + A0 = add(A0, S1); /* -4579...5231 */ + A0 = sub(mulhiconst(A0, 1592), mulhiconst(mulloconst(A0, -1832), 9157)); /* -4690...4705 */ + A0 = add(A0, S0); /* -4690...4960 */ + A0 = ifnegaddconst(A0, 9157); /* 0...9156 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, 25357); + + /* invalid inputs might need reduction mod 9157 */ + A1 = ifgesubconst(A1, 9157); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R1 ------> R0: reconstruct mod 761*[1531] */ + + R0[760] = 3 * R1[380] - 2295; + s -= 380; + i = 364; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R1[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 518), mulhiconst(mulloconst(A0, -10958), 1531)); /* -766...895 */ + A0 = add(A0, S0); /* -766...1150 */ + A0 = ifnegaddconst(A0, 1531); /* 0...1530 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, 15667); + + /* invalid inputs might need reduction mod 1531 */ + A1 = ifgesubconst(A1, 1531); + + A0 = mulloconst(A0, 3); + A1 = mulloconst(A1, 3); + A0 = subconst(A0, 2295); + A1 = subconst(A1, 2295); + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 
r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } +} diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_761x1531.h b/crypto_kem/sntrup761/avx2/crypto_decode_761x1531.h new file mode 100644 index 00000000..02ee10a8 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X1531_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X1531_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_761x3.c b/crypto_kem/sntrup761/avx2/crypto_decode_761x3.c new file mode 100644 index 00000000..a377eca4 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_761x3.c @@ -0,0 +1,65 @@ +#include "crypto_decode_761x3.h" +#include +#define uint8 uint8_t + +#define p 761 +#define loops 6 +#define overshoot 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3(void *v, const unsigned char *s) { + uint8 *f = v; + int loop; + uint8 *nextf = f + 128 - 4 * overshoot; + const unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i s0 = _mm256_loadu_si256((const __m256i *) s); + s = nexts; + nexts += 32; + + __m256i s1 = _mm256_srli_epi16(s0 & _mm256_set1_epi8(-16), 4); + s0 &= _mm256_set1_epi8(15); + + __m256i a0 = _mm256_unpacklo_epi8(s0, s1); + /* 0 0>>4 1 1>>4 2 2>>4 3 3>>4 4 4>>4 5 5>>4 6 6>>4 7 7>>4 */ + /* 16 16>>4 ... */ + __m256i a1 = _mm256_unpackhi_epi8(s0, s1); + /* 8 8>>4 9 9>>4 10 10>>4 ... */ + /* 24 24>>4 ... */ + + __m256i a2 = _mm256_srli_epi16(a0 & _mm256_set1_epi8(12), 2); + __m256i a3 = _mm256_srli_epi16(a1 & _mm256_set1_epi8(12), 2); + a0 &= _mm256_set1_epi8(3); + a1 &= _mm256_set1_epi8(3); + + __m256i b0 = _mm256_unpacklo_epi8(a0, a2); + /* 0 0>>2 0>>4 0>>6 1 1>>2 1>>4 1>>6 */ + /* 2 2>>2 2>>4 2>>6 3 3>>2 3>>4 3>.6 */ + /* 16 16>>2 16>>4 16>>6 ... */ + __m256i b2 = _mm256_unpackhi_epi8(a0, a2); + /* 4 4>>2 ... */ + __m256i b1 = _mm256_unpacklo_epi8(a1, a3); + /* 8 8>>2 ... */ + __m256i b3 = _mm256_unpackhi_epi8(a1, a3); + /* 12 12>>2 ... 
*/ + + __m256i f0 = _mm256_permute2x128_si256(b0, b2, 0x20); + __m256i f2 = _mm256_permute2x128_si256(b0, b2, 0x31); + __m256i f1 = _mm256_permute2x128_si256(b1, b3, 0x20); + __m256i f3 = _mm256_permute2x128_si256(b1, b3, 0x31); + + f0 = _mm256_add_epi8(f0, _mm256_set1_epi8(-1)); + f1 = _mm256_add_epi8(f1, _mm256_set1_epi8(-1)); + f2 = _mm256_add_epi8(f2, _mm256_set1_epi8(-1)); + f3 = _mm256_add_epi8(f3, _mm256_set1_epi8(-1)); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + f = nextf; + nextf += 128; + } + + *f = ((uint8)(*s & 3)) - 1; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_761x3.h b/crypto_kem/sntrup761/avx2/crypto_decode_761x3.h new file mode 100644 index 00000000..f72e26ad --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X3_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X3_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3_STRBYTES 191 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_761x4591.c b/crypto_kem/sntrup761/avx2/crypto_decode_761x4591.c new file mode 100644 index 00000000..35cd3196 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_761x4591.c @@ -0,0 +1,436 @@ +#include "crypto_decode_761x4591.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define int32 int32_t + +static inline int16 mullo(int16 x, int16 y) { + return x * y; +} + +static inline int16 mulhi(int16 x, int16 y) { + return (x * (int32)y) >> 16; +} + +static inline __m256i add(__m256i x, __m256i y) { + return _mm256_add_epi16(x, y); +} + +static inline __m256i sub(__m256i x, __m256i y) { + return _mm256_sub_epi16(x, y); +} + +static inline __m256i shiftleftconst(__m256i x, int16 y) { + return _mm256_slli_epi16(x, y); +} + +static inline __m256i signedshiftrightconst(__m256i x, int16 y) { + return _mm256_srai_epi16(x, y); +} + +static inline __m256i addconst(__m256i x, int16 y) { + return add(x, _mm256_set1_epi16(y)); +} + +static inline __m256i subconst(__m256i x, int16 y) { + return sub(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulloconst(__m256i x, int16 y) { + return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulhiconst(__m256i x, int16 y) { + return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i ifgesubconst(__m256i x, int16 y) { + __m256i y16 = _mm256_set1_epi16(y); + __m256i top16 = _mm256_set1_epi16((int16)(y - 1)); + return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); +} + +static inline __m256i ifnegaddconst(__m256i x, int16 y) { + return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); +} + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591(void *v, const unsigned char *s) { + int16 *R0 = v; + int16 R1[381], R2[191], R3[96], R4[48], R5[24], R6[12], R7[6], R8[3], R9[2], R10[1]; + long long i; + int16 a0, a1, a2; + __m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; + + s += PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591_STRBYTES; + a1 = 0; + a1 += *--s; /* 0...255 */ + a1 = mulhi(a1, -656) - mulhi(mullo(a1, -10434), 1608); + a1 += *--s; /* -804...1056 */ + a1 += (a1 >> 15) & 1608; /* 0...1607 */ + R10[0] = a1; + + /* 
R10 ------> R9: reconstruct mod 1*[9470]+[11127] */ + + i = 0; + s -= 2; + a2 = a0 = R10[0]; + a0 = mulhi(a0, -3624) - mulhi(mullo(a0, -1772), 9470); /* -5641...4735 */ + a0 += s[2 * i + 1]; /* -5641...4990 */ + a0 = mulhi(a0, -3624) - mulhi(mullo(a0, -1772), 9470); /* -5011...5046 */ + a0 += s[2 * i + 0]; /* -5011...5301 */ + a0 += (a0 >> 15) & 9470; /* 0...9469 */ + a1 = (a2 << 15) + (s[2 * i + 1] << 7) + ((s[2 * i] - a0) >> 1); + a1 = mullo(a1, -21121); + + /* invalid inputs might need reduction mod 11127 */ + a1 -= 11127; + a1 += (a1 >> 15) & 11127; + + R9[0] = a0; + R9[1] = a1; + s -= 0; + + /* R9 ------> R8: reconstruct mod 2*[1557]+[11127] */ + + R8[2] = R9[1]; + s -= 1; + for (i = 0; i >= 0; --i) { + a2 = a0 = R9[i]; + a0 = mulhi(a0, 541) - mulhi(mullo(a0, -10775), 1557); /* -779...913 */ + a0 += s[1 * i + 0]; /* -779...1168 */ + a0 += (a0 >> 15) & 1557; /* 0...1556 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -26307); + + /* invalid inputs might need reduction mod 1557 */ + a1 -= 1557; + a1 += (a1 >> 15) & 1557; + + R8[2 * i] = a0; + R8[2 * i + 1] = a1; + } + + /* R8 ------> R7: reconstruct mod 5*[10101]+[282] */ + + i = 0; + s -= 1; + a2 = a0 = R8[2]; + a0 = mulhi(a0, -545) - mulhi(mullo(a0, -1661), 10101); /* -5187...5050 */ + a0 += s[1 * i + 0]; /* -5187...5305 */ + a0 += (a0 >> 15) & 10101; /* 0...10100 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, 12509); + + /* invalid inputs might need reduction mod 282 */ + a1 -= 282; + a1 += (a1 >> 15) & 282; + + R7[4] = a0; + R7[5] = a1; + s -= 4; + for (i = 1; i >= 0; --i) { + a0 = R8[i]; + a0 = mulhi(a0, -545) - mulhi(mullo(a0, -1661), 10101); /* -5187...5050 */ + a0 += s[2 * i + 1]; /* -5187...5305 */ + a0 = mulhi(a0, -545) - mulhi(mullo(a0, -1661), 10101); /* -5095...5093 */ + a0 += s[2 * i + 0]; /* -5095...5348 */ + a0 += (a0 >> 15) & 10101; /* 0...10100 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, 12509); + + /* invalid inputs might need reduction mod 10101 */ + a1 -= 10101; + a1 += (a1 >> 15) & 10101; + + R7[2 * i] = a0; + R7[2 * i + 1] = a1; + } + + /* R7 ------> R6: reconstruct mod 11*[1608]+[11468] */ + + i = 0; + s -= 2; + a2 = a0 = R7[5]; + a0 = mulhi(a0, -656) - mulhi(mullo(a0, -10434), 1608); /* -968...804 */ + a0 += s[2 * i + 1]; /* -968...1059 */ + a0 = mulhi(a0, -656) - mulhi(mullo(a0, -10434), 1608); /* -815...813 */ + a0 += s[2 * i + 0]; /* -815...1068 */ + a0 += (a0 >> 15) & 1608; /* 0...1607 */ + a1 = (a2 << 13) + (s[2 * i + 1] << 5) + ((s[2 * i] - a0) >> 3); + a1 = mullo(a1, 6521); + + /* invalid inputs might need reduction mod 11468 */ + a1 -= 11468; + a1 += (a1 >> 15) & 11468; + + R6[10] = a0; + R6[11] = a1; + s -= 5; + for (i = 4; i >= 0; --i) { + a2 = a0 = R7[i]; + a0 = mulhi(a0, -656) - mulhi(mullo(a0, -10434), 1608); /* -968...804 */ + a0 += s[1 * i + 0]; /* -968...1059 */ + a0 += (a0 >> 15) & 1608; /* 0...1607 */ + a1 = (a2 << 5) + ((s[i] - a0) >> 3); + a1 = mullo(a1, 6521); + + /* invalid inputs might need reduction mod 1608 */ + a1 -= 1608; + a1 += (a1 >> 15) & 1608; + + R6[2 * i] = a0; + R6[2 * i + 1] = a1; + } + + /* R6 ------> R5: reconstruct mod 23*[10265]+[286] */ + + i = 0; + s -= 1; + a2 = a0 = R6[11]; + a0 = mulhi(a0, 4206) - mulhi(mullo(a0, -1634), 10265); /* -5133...6184 */ + a0 += s[1 * i + 0]; /* -5133...6439 */ + a0 += (a0 >> 15) & 10265; /* 0...10264 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -19415); + + /* invalid inputs might need reduction mod 286 */ + a1 -= 286; + a1 += (a1 >> 15) & 286; + + R5[22] = a0; + R5[23] = a1; + s -= 22; + for (i = 10; i >= 0; 
--i) { + a0 = R6[i]; + a0 = mulhi(a0, 4206) - mulhi(mullo(a0, -1634), 10265); /* -5133...6184 */ + a0 += s[2 * i + 1]; /* -5133...6439 */ + a0 = mulhi(a0, 4206) - mulhi(mullo(a0, -1634), 10265); /* -5462...5545 */ + a0 += s[2 * i + 0]; /* -5462...5800 */ + a0 += (a0 >> 15) & 10265; /* 0...10264 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, -19415); + + /* invalid inputs might need reduction mod 10265 */ + a1 -= 10265; + a1 += (a1 >> 15) & 10265; + + R5[2 * i] = a0; + R5[2 * i + 1] = a1; + } + + /* R5 ------> R4: reconstruct mod 47*[1621]+[11550] */ + + i = 0; + s -= 2; + a0 = R5[23]; + a0 = mulhi(a0, -134) - mulhi(mullo(a0, -10350), 1621); /* -844...810 */ + a0 += s[2 * i + 1]; /* -844...1065 */ + a0 = mulhi(a0, -134) - mulhi(mullo(a0, -10350), 1621); /* -813...812 */ + a0 += s[2 * i + 0]; /* -813...1067 */ + a0 += (a0 >> 15) & 1621; /* 0...1620 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, -14595); + + /* invalid inputs might need reduction mod 11550 */ + a1 -= 11550; + a1 += (a1 >> 15) & 11550; + + R4[46] = a0; + R4[47] = a1; + s -= 23; + i = 7; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R5[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -134), mulhiconst(mulloconst(A0, -10350), 1621)); /* -844...810 */ + A0 = add(A0, S0); /* -844...1065 */ + A0 = ifnegaddconst(A0, 1621); /* 0...1620 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, -14595); + + /* invalid inputs might need reduction mod 1621 */ + A1 = ifgesubconst(A1, 1621); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R4 ------> R3: reconstruct mod 95*[644]+[4591] */ + + i = 0; + s -= 1; + a2 = a0 = R4[47]; + a0 = mulhi(a0, -272) - mulhi(mullo(a0, -26052), 644); /* -390...322 */ + a0 += s[1 * i + 0]; /* -390...577 */ + a0 += (a0 >> 15) & 644; /* 0...643 */ + a1 = (a2 << 6) + ((s[i] - a0) >> 2); + a1 = mullo(a1, -7327); + + /* invalid inputs might need reduction mod 4591 */ + a1 -= 4591; + a1 += (a1 >> 15) & 4591; + + R3[94] = a0; + R3[95] = a1; + s -= 47; + i = 31; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R4[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -272), mulhiconst(mulloconst(A0, -26052), 644)); /* -390...322 */ + A0 = add(A0, S0); /* -390...577 */ + A0 = ifnegaddconst(A0, 644); /* 0...643 */ + A1 = add(shiftleftconst(A2, 6), signedshiftrightconst(sub(S0, A0), 2)); + A1 = mulloconst(A1, -7327); + + /* invalid inputs might need reduction mod 644 */ + A1 = ifgesubconst(A1, 644); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = 
_mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R3 ------> R2: reconstruct mod 190*[406]+[4591] */ + + R2[190] = R3[95]; + s -= 95; + i = 79; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R3[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 78), mulhiconst(mulloconst(A0, 24213), 406)); /* -203...222 */ + A0 = add(A0, S0); /* -203...477 */ + A0 = subconst(A0, 406); /* -609...71 */ + A0 = ifnegaddconst(A0, 406); /* -203...405 */ + A0 = ifnegaddconst(A0, 406); /* 0...405 */ + A1 = add(shiftleftconst(A2, 7), signedshiftrightconst(sub(S0, A0), 1)); + A1 = mulloconst(A1, 25827); + + /* invalid inputs might need reduction mod 406 */ + A1 = ifgesubconst(A1, 406); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R2 ------> R1: reconstruct mod 380*[322]+[4591] */ + + R1[380] = R2[190]; + s -= 190; + i = 174; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R2[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 50), mulhiconst(mulloconst(A0, 13433), 322)); /* -161...173 */ + A0 = add(A0, S0); /* -161...428 */ + A0 = subconst(A0, 322); /* -483...106 */ + A0 = ifnegaddconst(A0, 322); /* -161...321 */ + A0 = ifnegaddconst(A0, 322); /* 0...321 */ + A1 = add(shiftleftconst(A2, 7), signedshiftrightconst(sub(S0, A0), 1)); + A1 = mulloconst(A1, -7327); + + /* invalid inputs might need reduction mod 322 */ + A1 = ifgesubconst(A1, 322); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R1 ------> R0: reconstruct mod 761*[4591] */ + + R0[760] = R1[380] - 2295; + s -= 760; + i = 364; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R1[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 1702), mulhiconst(mulloconst(A0, -3654), 4591)); /* -2296...2721 */ + A0 = add(A0, S1); /* -2296...2976 */ + A0 = sub(mulhiconst(A0, 1702), 
mulhiconst(mulloconst(A0, -3654), 4591)); /* -2356...2372 */ + A0 = add(A0, S0); /* -2356...2627 */ + A0 = ifnegaddconst(A0, 4591); /* 0...4590 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, 15631); + + /* invalid inputs might need reduction mod 4591 */ + A1 = ifgesubconst(A1, 4591); + + A0 = subconst(A0, 2295); + A1 = subconst(A1, 2295); + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } +} diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_761x4591.h b/crypto_kem/sntrup761/avx2/crypto_decode_761x4591.h new file mode 100644 index 00000000..6b637814 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_761x4591.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X4591_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X4591_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591_STRBYTES 1158 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_761xint16.c b/crypto_kem/sntrup761/avx2/crypto_decode_761xint16.c new file mode 100644 index 00000000..4b323850 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_761xint16.c @@ -0,0 +1,16 @@ +#include "crypto_decode_761xint16.h" + + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_761xint16.h b/crypto_kem/sntrup761/avx2/crypto_decode_761xint16.h new file mode 100644 index 00000000..050d7c9a --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761XINT16_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761XINT16_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16_STRBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16_ITEMS 761 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_761xint32.c b/crypto_kem/sntrup761/avx2/crypto_decode_761xint32.c new file mode 100644 index 00000000..fb5210b1 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_761xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_761xint32.h" + + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 
1; + s += 4; + } +} diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_761xint32.h b/crypto_kem/sntrup761/avx2/crypto_decode_761xint32.h new file mode 100644 index 00000000..dfaf6eeb --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_761xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761XINT32_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761XINT32_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32_STRBYTES 3044 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32_ITEMBYTES 4 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32_ITEMS 761 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_int16.c b/crypto_kem/sntrup761/avx2/crypto_decode_int16.c new file mode 100644 index 00000000..3e4bf8ef --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_int16.c @@ -0,0 +1,9 @@ +#include "crypto_decode_int16.h" + + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_int16(void *x, const unsigned char *s) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *(uint16_t *) x = u0 | u1; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_decode_int16.h b/crypto_kem/sntrup761/avx2/crypto_decode_int16.h new file mode 100644 index 00000000..8ed24816 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_decode_int16.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_INT16_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_INT16_H + +#include +#define crypto_core_multsntrup857_STRBYTES 2 +#define crypto_core_multsntrup857_ITEMBYTES 2 +#define crypto_core_multsntrup857_ITEMS 1 +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_int16(void *x, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_761x1531.c b/crypto_kem/sntrup761/avx2/crypto_encode_761x1531.c new file mode 100644 index 00000000..c5a03da4 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761x1531.c @@ -0,0 +1,301 @@ +#include "crypto_encode_761x1531.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 48; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2295)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1531)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[380] = (((R0[760] + 2295) & 16383) * 10923) >> 15; + + 
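+    /* Note: the loop above is the first pass of the pairwise byte encoder.  Each pair
+       of digits r0,r1 in [0,1530] (the rounded coefficients mapped via (R0[j]+2295)/3)
+       is combined as v = r0 + 1531*r1 < 1531^2; the low byte of v is emitted to out
+       and v >> 8 (< 9157) is written back into R, with R[380] holding the single
+       leftover coefficient.  A scalar sketch of one step:
+           uint32 v = r0 + 1531*(uint32)r1;
+           *out++ = (unsigned char) v;
+           R[j] = (uint16) (v >> 8);
+       The passes below repeat this with radices 9157, 1280, 6400, 625 and 1526, and
+       the scalar loops at the end finish with 9097, 1263, 6232 and 593. */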
reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9157)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9157)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[190] = R[380]; + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 2; + writing -= 1; + out -= 1; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1280)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[95] = R[190]; + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6400)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6400)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, 
_mm256_set1_epi32(625)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1526)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + for (i = 0; i < 6; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9097; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1263; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)6232; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)593; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_761x1531.h b/crypto_kem/sntrup761/avx2/crypto_encode_761x1531.h new file mode 100644 index 00000000..c4a2a753 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X1531_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X1531_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_761x1531round.c b/crypto_kem/sntrup761/avx2/crypto_encode_761x1531round.c new file mode 100644 index 00000000..ab045dbd --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761x1531round.c @@ -0,0 +1,303 @@ +#include "crypto_encode_761x1531round.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + const uint16 *reading; + uint16 
*writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 48; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); + x = _mm256_add_epi16(x, _mm256_add_epi16(x, x)); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2295)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1531)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[380] = (((3 * ((10923 * R0[760] + 16384) >> 15) + 2295) & 16383) * 10923) >> 15; + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9157)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9157)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[190] = R[380]; + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 2; + writing -= 1; + out -= 1; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1280)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[95] = R[190]; + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) 
{ + __m256i x, x2, y, y2; + --i; + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6400)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6400)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(625)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1526)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + for (i = 0; i < 6; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9097; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1263; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)6232; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)593; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git 
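The rounded encoder just above (and the other auto-generated encoders in this patch) all follow the same radix-encoding pattern: coefficients are first mapped into a range [0, m), adjacent pairs are combined as r0 + m*r1, low bytes are written out whenever the running modulus reaches 2^14, and the halved array is processed again with the new moduli (which is where constants such as 1531, 9157 and 1280 come from). The scalar sketch below is illustrative only; encode_sketch is a hypothetical helper in the spirit of the NTRU Prime reference Encode routine, not code from this patch.

#include <stdint.h>

/* Hypothetical scalar sketch (not part of this patch) of the radix encoding
 * that the auto-generated routines above implement with AVX2: 0 <= R[i] < M[i],
 * all M[i] < 16384.  Pairs are combined as R[i] + M[i]*R[i+1]; low bytes are
 * emitted until the combined range drops below 2^14; the halved array is then
 * encoded recursively.  Array sizes here are fixed for at most 762 inputs. */
static void encode_sketch(unsigned char *out, const uint16_t *R, const uint16_t *M, long len) {
    uint16_t R2[381], M2[381];
    long i;
    if (len == 1) {
        uint16_t r = R[0], m = M[0];
        while (m > 1) {               /* emit bytes until the remaining range is trivial */
            *out++ = (unsigned char)r;
            r >>= 8;
            m = (m + 255) >> 8;
        }
        return;
    }
    for (i = 0; i + 1 < len; i += 2) {
        uint32_t m0 = M[i];
        uint32_t r = R[i] + (uint32_t)R[i + 1] * m0;
        uint32_t m = (uint32_t)M[i + 1] * m0;
        while (m >= 16384) {          /* keep combined values below 2^14 */
            *out++ = (unsigned char)r;
            r >>= 8;
            m = (m + 255) >> 8;
        }
        R2[i / 2] = (uint16_t)r;
        M2[i / 2] = (uint16_t)m;
    }
    if (i < len) {                    /* odd leftover passes through unchanged */
        R2[i / 2] = R[i];
        M2[i / 2] = M[i];
    }
    encode_sketch(out, R2, M2, (len + 1) / 2);
}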
a/crypto_kem/sntrup761/avx2/crypto_encode_761x1531round.h b/crypto_kem/sntrup761/avx2/crypto_encode_761x1531round.h new file mode 100644 index 00000000..b3c29ef9 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761x1531round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X1531ROUND_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X1531ROUND_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_761x3.c b/crypto_kem/sntrup761/avx2/crypto_encode_761x3.c new file mode 100644 index 00000000..d7442199 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761x3.c @@ -0,0 +1,64 @@ +#include "crypto_encode_761x3.h" +#include +#define uint8 uint8_t + +#define p 761 +#define loops 6 +#define overshoot 2 + +static const union { + uint8 init[32]; + __m256i val; +} lobytes_buf = { .init = { + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + } +}; +#define lobytes (lobytes_buf.val) + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3(unsigned char *s, const void *v) { + const uint8 *f = v; + int loop; + const uint8 *nextf = f + 128 - 4 * overshoot; + unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i f0 = _mm256_loadu_si256((const __m256i *) (f + 0)); + __m256i f1 = _mm256_loadu_si256((const __m256i *) (f + 32)); + __m256i f2 = _mm256_loadu_si256((const __m256i *) (f + 64)); + __m256i f3 = _mm256_loadu_si256((const __m256i *) (f + 96)); + f = nextf; + nextf += 128; + + __m256i a0 = _mm256_packus_epi16(f0 & lobytes, f1 & lobytes); + /* 0 2 4 6 8 10 12 14 32 34 36 38 40 42 44 46 */ + /* 16 18 20 22 24 26 28 30 48 50 52 54 56 58 60 62 */ + __m256i a1 = _mm256_packus_epi16(_mm256_srli_epi16(f0, 8), _mm256_srli_epi16(f1, 8)); + /* 1 3 ... */ + __m256i a2 = _mm256_packus_epi16(f2 & lobytes, f3 & lobytes); + __m256i a3 = _mm256_packus_epi16(_mm256_srli_epi16(f2, 8), _mm256_srli_epi16(f3, 8)); + + a0 = _mm256_add_epi8(a0, _mm256_slli_epi16(a1 & _mm256_set1_epi8(63), 2)); + a2 = _mm256_add_epi8(a2, _mm256_slli_epi16(a3 & _mm256_set1_epi8(63), 2)); + + __m256i b0 = _mm256_packus_epi16(a0 & lobytes, a2 & lobytes); + /* 0 4 8 12 32 36 40 44 64 68 72 76 96 100 104 108 */ + /* 16 20 24 28 48 52 56 60 80 84 88 92 112 116 120 124 */ + __m256i b2 = _mm256_packus_epi16(_mm256_srli_epi16(a0, 8), _mm256_srli_epi16(a2, 8)); + /* 2 6 ... 
*/ + + b0 = _mm256_add_epi8(b0, _mm256_slli_epi16(b2 & _mm256_set1_epi8(15), 4)); + + b0 = _mm256_permutevar8x32_epi32(b0, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + + b0 = _mm256_add_epi8(b0, _mm256_set1_epi8(85)); + + _mm256_storeu_si256((__m256i *) s, b0); + s = nexts; + nexts += 32; + } + + *s++ = *f++ + 1; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_761x3.h b/crypto_kem/sntrup761/avx2/crypto_encode_761x3.h new file mode 100644 index 00000000..e99384fb --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X3_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X3_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3_STRBYTES 191 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_761x4591.c b/crypto_kem/sntrup761/avx2/crypto_encode_761x4591.c new file mode 100644 index 00000000..7d87488f --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761x4591.c @@ -0,0 +1,308 @@ +#include "crypto_encode_761x4591.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 24; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 8; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2295)); + x2 = _mm256_add_epi16(x2, _mm256_set1_epi16(2295)); + x &= _mm256_set1_epi16(16383); + x2 &= _mm256_set1_epi16(16383); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(4591)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(4591)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[380] = ((R0[760] + 2295) & 16383); + + reading = (uint16 *) R; + writing = R; + i = 24; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 2; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(322)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 
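crypto_encode_761x3 above packs four small coefficients per byte: working on {-1, 0, 1} values, it accumulates c0 + 4*c1 + 16*c2 + 64*c3 through the shift-and-add steps and finally adds 85 = 0b01010101, which adds 1 in every 2-bit slot so each stored field is the coefficient plus one. A hypothetical scalar version of that per-byte packing:

#include <stdint.h>

/* Hypothetical scalar form of the 4-trits-per-byte packing; f[i] in {-1, 0, 1}. */
static unsigned char pack4_sketch(const int8_t f[4]) {
    return (unsigned char)((f[0] + 1) + ((f[1] + 1) << 2) + ((f[2] + 1) << 4) + ((f[3] + 1) << 6));
}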
14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[190] = R[380]; + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 2; + writing -= 1; + out -= 1; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(406)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[95] = R[190]; + + reading = (uint16 *) R; + writing = R; + i = 6; + while (i > 0) { + __m256i x, y; + --i; + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(644)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 2; + writing -= 1; + out -= 1; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1621)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + r0 = R[46]; + r1 = R[47]; + r2 = r0 + r1 * (uint32)1621; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[23] = r2; + + for (i = 0; i < 11; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)10265; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[22]; + r1 = R[23]; + r2 = r0 + r1 * 
(uint32)10265; + *out++ = r2; + r2 >>= 8; + R[11] = r2; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1608; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[10]; + r1 = R[11]; + r2 = r0 + r1 * (uint32)1608; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[5] = r2; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)10101; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[4]; + r1 = R[5]; + r2 = r0 + r1 * (uint32)10101; + *out++ = r2; + r2 >>= 8; + R[2] = r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)1557; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)9470; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_761x4591.h b/crypto_kem/sntrup761/avx2/crypto_encode_761x4591.h new file mode 100644 index 00000000..52404cc2 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761x4591.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X4591_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X4591_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591_STRBYTES 1158 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_761xfreeze3.c b/crypto_kem/sntrup761/avx2/crypto_encode_761xfreeze3.c new file mode 100644 index 00000000..6622e310 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761xfreeze3.c @@ -0,0 +1,31 @@ +#include "crypto_encode_761xfreeze3.h" +#include +#define int16 int16_t + +#define p 761 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3(unsigned char *s, const void *v) { + const int16 *r = v; + + int i = p - 16; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) r); + __m256i y = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); + x = _mm256_sub_epi16(x, y); + y = _mm256_add_epi16(y, y); + x = _mm256_sub_epi16(x, y); + __m128i x0 = _mm256_extractf128_si256(x, 0); + __m128i x1 = _mm256_extractf128_si256(x, 1); + _mm_storeu_si128((__m128i *) s, _mm_packs_epi16(x0, x1)); + i -= 16; + r += 16; + s += 16; + } while (i >= 0); + if (i <= -16) { + break; + } + r += i; + s += i; + } +} diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_761xfreeze3.h b/crypto_kem/sntrup761/avx2/crypto_encode_761xfreeze3.h new file mode 100644 index 00000000..cc89f9a5 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761xfreeze3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761XFREEZE3_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761XFREEZE3_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3_STRBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_761xint16.c b/crypto_kem/sntrup761/avx2/crypto_encode_761xint16.c new file mode 100644 index 00000000..ba851932 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761xint16.c @@ -0,0 +1,13 @@ 
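crypto_encode_761xfreeze3 above reduces each Fq coefficient to its balanced representative modulo 3 using the fixed-point constant 10923 ≈ 2^15/3: _mm256_mulhrs_epi16 yields approximately round(x/3), and x minus three times that value lands in {-1, 0, 1}. A hypothetical scalar equivalent, assuming |x| <= 2295 as in the Fq representation used here:

#include <stdint.h>

/* Hypothetical scalar form of the freeze step; |x| <= 2295 assumed. */
static int8_t freeze3_sketch(int16_t x) {
    int16_t q = (int16_t)(((int32_t)x * 10923 + 16384) >> 15); /* mulhrs: ~round(x / 3) */
    return (int8_t)(x - 3 * q);                                /* balanced residue in {-1, 0, 1} */
}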
+#include "crypto_encode_761xint16.h" + + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_761xint16.h b/crypto_kem/sntrup761/avx2/crypto_encode_761xint16.h new file mode 100644 index 00000000..38252f8d --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761XINT16_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761XINT16_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16_STRBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16_ITEMS 761 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_int16.c b/crypto_kem/sntrup761/avx2/crypto_encode_int16.c new file mode 100644 index 00000000..378d0667 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_int16.c @@ -0,0 +1,9 @@ +#include "crypto_encode_int16.h" + +#define uint16 uint16_t + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16(unsigned char *s, const void *x) { + uint16 u = *(const uint16 *) x; + s[0] = u; + s[1] = u >> 8; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_encode_int16.h b/crypto_kem/sntrup761/avx2/crypto_encode_int16.h new file mode 100644 index 00000000..348de67a --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_encode_int16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_INT16_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_INT16_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16_STRBYTES 2 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16_ITEMS 1 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16(unsigned char *s, const void *x); +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_sort_int32.c b/crypto_kem/sntrup761/avx2/crypto_sort_int32.c new file mode 100644 index 00000000..0f42e458 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_sort_int32.c @@ -0,0 +1,1210 @@ +#include "crypto_sort_int32.h" +#include +// Based on supercop-20200820/crypto_sort/int32/avx2 + + +#define int32 int32_t + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min((a),(b)); \ + (b) = int32x8_max((a),(b)); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = (int32)((int64_t) * b - (int64_t) * a); + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if ((long long) n < 8) { + while ((long long) n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 
8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + 
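int32_MINMAX, defined near the top of this sort implementation, is a branchless compare-and-swap: the sign of b - a (with a fix-up when a and b have opposite signs) produces an all-ones or all-zero mask that conditionally exchanges the two values without a data-dependent branch. Functionally it matches this plain reference form, shown here only for clarity:

#include <stdint.h>

/* Plain (branching) equivalent of int32_MINMAX; illustrative only. */
static inline void int32_minmax_ref(int32_t *a, int32_t *b) {
    if (*b < *a) {
        int32_t t = *a;
        *a = *b;
        *b = t;
    }
}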
int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, 
x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + 
int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = 
int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = 
_mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, 
x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_SNTRUP761_AVX2_crypto_sort_int32(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; 
++i) { + ((int32 *) y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_SNTRUP761_AVX2_crypto_sort_int32(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); 
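For lengths that are not a power of two, the sort above either pads a small input out to the next suitable power of two with 0x7fffffff sentinels (which sink to the tail, leaving the first n entries sorted) or sorts a power-of-two prefix, recursively sorts the remainder, and merges the two. The padding idea in scalar form, purely as an illustration (sort_pow2 stands in for any power-of-two sorter):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Illustration only: pad with INT32_MAX sentinels up to a power of two,
 * sort the padded buffer, and keep the first n entries. */
static void sort_padded(int32_t *x, size_t n, void (*sort_pow2)(int32_t *, size_t)) {
    size_t m = 8;
    int32_t *y;
    size_t i;
    while (m < n) {
        m <<= 1;
    }
    y = malloc(m * sizeof *y);
    if (y == NULL) {
        return;                      /* error handling elided in this sketch */
    }
    memcpy(y, x, n * sizeof *x);
    for (i = n; i < m; ++i) {
        y[i] = INT32_MAX;            /* never smaller than any real entry */
    }
    sort_pow2(y, m);
    memcpy(x, y, n * sizeof *x);
    free(y);
}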
+ int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = 
_mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/sntrup761/avx2/crypto_sort_int32.h b/crypto_kem/sntrup761/avx2/crypto_sort_int32.h new file mode 100644 index 00000000..b328ef1c --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_SORT +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_SORT + +#include +#include + + +void PQCLEAN_SNTRUP761_AVX2_crypto_sort_int32(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_sort_uint32.c b/crypto_kem/sntrup761/avx2/crypto_sort_uint32.c new file mode 100644 index 00000000..d7e172a0 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_SNTRUP761_AVX2_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_SNTRUP761_AVX2_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/sntrup761/avx2/crypto_sort_uint32.h b/crypto_kem/sntrup761/avx2/crypto_sort_uint32.h new file mode 100644 index 00000000..91ecbcc9 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_sort_uint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_SORT_UINT32_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_SORT_UINT32_H + +#include +#include + + +void PQCLEAN_SNTRUP761_AVX2_crypto_sort_uint32(uint32_t *array, size_t n); + +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_stream_aes256ctr.c b/crypto_kem/sntrup761/avx2/crypto_stream_aes256ctr.c new file mode 100644 index 00000000..cd5240ed --- /dev/null +++ 
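The XOR with 0x80000000 in crypto_sort_uint32 above works because flipping the top bit maps unsigned order onto signed order. A minimal standalone sketch of that property (as_int32 is an illustrative helper, not part of the patch):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Reinterpret a uint32 bit pattern as int32 (int32_t is two's complement). */
static int32_t as_int32(uint32_t x) {
    int32_t y;
    memcpy(&y, &x, sizeof y);
    return y;
}

int main(void) {
    /* Flipping the top bit maps 0..2^32-1 monotonically onto INT32_MIN..INT32_MAX,
       so an int32 sort of the flipped words, followed by flipping back,
       sorts the original words in uint32 order. */
    uint32_t lo = 5, hi = 0x90000000u;   /* lo < hi as uint32 */
    assert(lo < hi);
    assert(as_int32(lo ^ 0x80000000u) < as_int32(hi ^ 0x80000000u));
    return 0;
}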
b/crypto_kem/sntrup761/avx2/crypto_stream_aes256ctr.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" + + +int PQCLEAN_SNTRUP761_AVX2_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); + return 0; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_stream_aes256ctr.h b/crypto_kem/sntrup761/avx2/crypto_stream_aes256ctr.h new file mode 100644 index 00000000..f5072215 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_stream_aes256ctr.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_STREAM_AES256CTR_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_STREAM_AES256CTR_H +#include "aes.h" +#include +#include + + + +int PQCLEAN_SNTRUP761_AVX2_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]); + +#endif diff --git a/crypto_kem/sntrup761/avx2/crypto_verify_1039.c b/crypto_kem/sntrup761/avx2/crypto_verify_1039.c new file mode 100644 index 00000000..2bc66d89 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_verify_1039.c @@ -0,0 +1,36 @@ +#include "crypto_verify_1039.h" +#include + +int PQCLEAN_SNTRUP761_AVX2_crypto_verify_1039(const unsigned char *x, const unsigned char *y) { + __m256i diff = _mm256_set1_epi8(0); + unsigned int differentbits = 0; + int i = PQCLEAN_SNTRUP761_AVX2_crypto_verify_1039_BYTES; + + i -= 32; + for (;;) { + do { + __m256i x0 = _mm256_loadu_si256((__m256i *) x); + __m256i y0 = _mm256_loadu_si256((__m256i *) y); + diff |= x0 ^ y0; + i -= 32; + x += 32; + y += 32; + } while (i >= 0); + if (i <= -32) { + break; + } + x += i; + y += i; + } + + diff |= _mm256_srli_epi16(diff, 8); + diff |= _mm256_srli_epi32(diff, 16); + diff |= _mm256_srli_epi64(diff, 32); + + differentbits = _mm256_extract_epi8(diff, 0); + differentbits |= _mm256_extract_epi8(diff, 8); + differentbits |= _mm256_extract_epi8(diff, 16); + differentbits |= _mm256_extract_epi8(diff, 24); + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/crypto_kem/sntrup761/avx2/crypto_verify_1039.h b/crypto_kem/sntrup761/avx2/crypto_verify_1039.h new file mode 100644 index 00000000..b0d5ffd4 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/crypto_verify_1039.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_VERIFY_1039_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_VERIFY_1039_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_verify_1039_BYTES 1039 + +int PQCLEAN_SNTRUP761_AVX2_crypto_verify_1039(const unsigned char *x, const unsigned char *y); +#endif diff --git a/crypto_kem/sntrup761/avx2/kem.c b/crypto_kem/sntrup761/avx2/kem.c new file mode 100644 index 00000000..7d0d0152 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/kem.c @@ -0,0 +1,247 @@ +#include "api.h" +#include "crypto_sort_uint32.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* ----- small polynomials */ + +/* R3_fromR(R_fromRq(r)) */ +static void R3_fromRq(small *out, const Fq *r) { + crypto_encode_pxfreeze3((unsigned char *) out, (unsigned char *) r); +} + +/* h = f*g in the ring R3 */ +static void 
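The verify routine above folds every byte difference into differentbits, which ends up in 0..255, and then derives a 0 / -1 result without branching. A scalar sketch of the same mask arithmetic (verify_ct is an illustrative name, not a function from this patch):

#include <assert.h>
#include <stddef.h>

/* Byte-wise constant-time comparison: differentbits stays in 0..255, so
   (differentbits - 1) >> 8 has its low bit set exactly when all bytes matched,
   giving 0 on equality and -1 otherwise. */
static int verify_ct(const unsigned char *x, const unsigned char *y, size_t len) {
    unsigned int differentbits = 0;
    size_t i;
    for (i = 0; i < len; i++) {
        differentbits |= x[i] ^ y[i];   /* accumulate any mismatch, no early exit */
    }
    return (int) (1 & ((differentbits - 1) >> 8)) - 1;
}

int main(void) {
    unsigned char a[4] = {1, 2, 3, 4};
    unsigned char b[4] = {1, 2, 3, 4};
    unsigned char c[4] = {1, 2, 3, 5};
    assert(verify_ct(a, b, 4) == 0);
    assert(verify_ct(a, c, 4) == -1);
    return 0;
}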
R3_mult(small *h, const small *f, const small *g) { + crypto_core_mult3((unsigned char *) h, (const unsigned char *) f, (const unsigned char *) g); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* h = 3f in Rq */ +static void Rq_mult3(Fq *h, const Fq *f) { + crypto_encode_pxint16((unsigned char *) h, f); + crypto_core_scale3((unsigned char *) h, (const unsigned char *) h); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* out = 1/(3*in) in Rq */ +/* caller must have 2p+1 bytes free in out, not just 2p */ +static void Rq_recip3(Fq *out, const small *in) { + crypto_core_inv((unsigned char *) out, (const unsigned char *) in); + /* could check byte 2*p for failure; but, in context, inv always works */ + crypto_decode_pxint16(out, (unsigned char *) out); +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[ppadsort]; + int i; + + randombytes((unsigned char *) L, 4 * p); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < w; ++i) { + L[i] = L[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (L[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_SNTRUP761_AVX2_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +static void Small_random(small *out) { + uint32 L[p]; + int i; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + out[i] = (((L[i] & 0x3fffffff) * 3) >> 30) - 1; + } +} + +/* ----- Streamlined NTRU Prime */ + +typedef small Inputs[p]; /* passed by reference */ +#define Ciphertexts_bytes Rounded_bytes +#define SecretKeys_bytes (2*Small_bytes) +#define PublicKeys_bytes Rq_bytes +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +/* also set r_enc[0]=3 */ +/* also set x[0]=2, and x[1:1+Hash_bytes] = Hash3(r_enc) */ +/* also overwrite x[1+Hash_bytes:1+2*Hash_bytes] */ +static void Hide(unsigned char *x, unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + Fq h[p]; + int i; + + Small_encode(r_enc + 1, r); + Rq_decode(h, pk); + Rq_mult_small(h, r); + Round_and_encode(c, h); + r_enc[0] = 3; + Hash(x + 1, r_enc, 1 + Small_bytes); + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Hash_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c + Ciphertexts_bytes, x, 1 + Hash_bytes * 2); +} + + +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + small g[p]; + for (;;) { + Small_random(g); + { + small v[p + 1]; + crypto_core_inv3((unsigned char *) v, (const unsigned char *) g); + if (v[p] == 0) { + Small_encode(sk + Small_bytes, v); + break; + } + } + } + { + small f[p]; + Short_random(f); + Small_encode(sk, f); + { + Fq h[p + 1]; + Rq_recip3(h, f); /* always works */ + Rq_mult_small(h, g); + Rq_encode(pk, h); + } + } + { + int i; + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) 
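Short_random above samples a weight-w ternary vector by tagging w random words as nonzero (low bit cleared) and p-w words as zero (low bits 01), then sorting so the random high bits shuffle the tags. A toy illustration of that combinatorial idea, using qsort and rand() purely for brevity (the patch itself uses its constant-time sort and real randomness; P and W mirror p and w for sntrup761):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define P 761
#define W 286

static int cmp_u32(const void *a, const void *b) {
    uint32_t x = *(const uint32_t *) a, y = *(const uint32_t *) b;
    return (x > y) - (x < y);
}

int main(void) {
    uint32_t L[P];
    int8_t out[P];
    int i, weight = 0;

    for (i = 0; i < P; ++i) {
        L[i] = ((uint32_t) rand() << 16) ^ (uint32_t) rand();  /* toy randomness */
    }
    for (i = 0; i < W; ++i) {
        L[i] &= (uint32_t) -2;             /* low bits 00 or 10 -> coefficient -1 or +1 */
    }
    for (i = W; i < P; ++i) {
        L[i] = (L[i] & (uint32_t) -3) | 1; /* low bits 01 -> coefficient 0 */
    }
    qsort(L, P, sizeof L[0], cmp_u32);     /* the patch uses its constant-time sort here */
    for (i = 0; i < P; ++i) {
        out[i] = (int8_t) ((int) (L[i] & 3) - 1);
        weight += out[i] & 1;              /* counts the +/-1 coefficients */
    }
    assert(weight == W);                   /* exactly w nonzero entries survive the shuffle */
    return 0;
}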
{ + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Small_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Small_bytes); + } + return 0; +} + +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + unsigned char cache[Hash_bytes]; + int i; + { + unsigned char y[1 + PublicKeys_bytes]; /* XXX: can eliminate with incremental hashing */ + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + { + Inputs r; + Short_random(r); + { + unsigned char r_enc[Small_bytes + 1]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(x, c, r_enc, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + } + return 0; +} + +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Small_bytes; + int mask, i; + Inputs r; + { + Fq d[p]; + Rounded_decode(d, c); + { + small f[p]; + Small_decode(f, sk); + Rq_mult_small(d, f); + Rq_mult3(d, d); + } + { + small e[p]; + small v[p]; + R3_fromRq(e, d); + Small_decode(v, sk + Small_bytes); + R3_mult(r, e, v); + } + crypto_core_wforce((unsigned char *) r, (unsigned char *) r); + } + { + unsigned char r_enc[1 + Small_bytes]; + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + /* XXX: can use incremental hashing to reduce x size */ + + Hide(x, cnew, r_enc, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Small_bytes; ++i) { + r_enc[i + 1] ^= mask & (r_enc[i + 1] ^ rho[i]); + } + Hash(x + 1, r_enc, 1 + Small_bytes); /* XXX: can instead do cmov on cached hash of rho */ + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/sntrup761/avx2/params.h b/crypto_kem/sntrup761/avx2/params.h new file mode 100644 index 00000000..7d1c63d8 --- /dev/null +++ b/crypto_kem/sntrup761/avx2/params.h @@ -0,0 +1,71 @@ +#ifndef params_H +#define params_H +#include "crypto_core_inv3sntrup761.h" +#include "crypto_core_invsntrup761.h" +#include "crypto_core_mult3sntrup761.h" +#include "crypto_core_multsntrup761.h" +#include "crypto_core_scale3sntrup761.h" +#include "crypto_core_weightsntrup761.h" +#include "crypto_core_wforcesntrup761.h" +#include "crypto_decode_761x1531.h" +#include "crypto_decode_761x3.h" +#include "crypto_decode_761x4591.h" +#include "crypto_decode_761xint16.h" +#include "crypto_decode_761xint32.h" +#include "crypto_encode_761x1531.h" +#include "crypto_encode_761x1531round.h" +#include "crypto_encode_761x3.h" +#include "crypto_encode_761x4591.h" +#include "crypto_encode_761xfreeze3.h" +#include "crypto_encode_761xint16.h" +#include "crypto_encode_int16.h" +#include "crypto_verify_1039.h" + + +#define p 761 +#define qinv 15631 /* reciprocal of q mod 2^16 */ +#define q27 29235 /* closest integer to 2^27/q */ +#define q18 57 /* closest integer to 2^18/q */ +#define ppad 769 +#define crypto_core_weight PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761 +#define q 4591 +#define 
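On decapsulation failure, crypto_kem_dec above substitutes rho for the recovered r without branching, driven by the 0 / -1 mask from the constant-time verify. A minimal sketch of that conditional-overwrite idiom (cmov_byte is an illustrative name, not part of the patch):

#include <assert.h>

/* mask is 0 (ciphertexts matched: keep a) or -1 (mismatch: take b);
   a ^= mask & (a ^ b) selects between them with no data-dependent branch. */
static unsigned char cmov_byte(unsigned char a, unsigned char b, int mask) {
    a ^= (unsigned char) (mask & (a ^ b));
    return a;
}

int main(void) {
    assert(cmov_byte(0x12, 0x34, 0) == 0x12);
    assert(cmov_byte(0x12, 0x34, -1) == 0x34);
    return 0;
}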
w 286 + +#define ppadsort 768 + +#define crypto_verify_clen PQCLEAN_SNTRUP761_AVX2_crypto_verify_1039 + +#define Rq_bytes PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591_STRBYTES +#define Rq_encode PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591 +#define Rq_decode PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591 + +#define Rounded_bytes PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531_STRBYTES +#define Rounded_decode PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531 + +#define Round_and_encode PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round + +#define Small_bytes PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3_STRBYTES +#define Small_encode PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3 +#define Small_decode PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3 + +#define crypto_encode_pxfreeze3 PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3 + +#define crypto_decode_pxint32 PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16 + +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16 + +#define crypto_core_wforce PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761 + +#define crypto_core_scale3 PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761 + +#define crypto_core_inv PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761 + +#define crypto_core_inv3 PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761 + +#define crypto_core_mult3 PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761 + +#define crypto_core_mult PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761 + +#endif diff --git a/crypto_kem/sntrup761/clean/LICENSE b/crypto_kem/sntrup761/clean/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/sntrup761/clean/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/sntrup761/clean/Makefile b/crypto_kem/sntrup761/clean/Makefile new file mode 100644 index 00000000..b62759a2 --- /dev/null +++ b/crypto_kem/sntrup761/clean/Makefile @@ -0,0 +1,19 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libsntrup761_clean.a +HEADERS=api.h crypto_core_inv3sntrup761.h crypto_core_invsntrup761.h crypto_core_mult3sntrup761.h crypto_core_multsntrup761.h crypto_core_scale3sntrup761.h crypto_core_weightsntrup761.h crypto_core_wforcesntrup761.h crypto_decode_761x1531.h crypto_decode_761x3.h crypto_decode_761x4591.h crypto_decode_761xint16.h crypto_decode_761xint32.h crypto_encode_761x1531.h crypto_encode_761x1531round.h crypto_encode_761x3.h crypto_encode_761x4591.h crypto_encode_761xfreeze3.h crypto_encode_761xint16.h crypto_encode_int16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_1039.h params.h +OBJECTS=crypto_core_inv3sntrup761.o crypto_core_invsntrup761.o crypto_core_mult3sntrup761.o crypto_core_multsntrup761.o crypto_core_scale3sntrup761.o crypto_core_weightsntrup761.o crypto_core_wforcesntrup761.o crypto_decode_761x1531.o crypto_decode_761x3.o crypto_decode_761x4591.o crypto_decode_761xint16.o crypto_decode_761xint32.o crypto_encode_761x1531.o crypto_encode_761x1531round.o crypto_encode_761x3.o crypto_encode_761x4591.o crypto_encode_761xfreeze3.o crypto_encode_761xint16.o crypto_encode_int16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1039.o kem.o + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + 
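The reciprocal constants declared in params.h above can be checked independently; a small standalone sketch (not part of the patch) verifying qinv, q27 and q18 for q = 4591:

#include <assert.h>
#include <stdint.h>

int main(void) {
    const int32_t q = 4591;
    const uint16_t qinv = 15631;
    const int32_t q27 = 29235;
    const int32_t q18 = 57;

    assert((uint16_t) ((uint32_t) q * qinv) == 1);   /* reciprocal of q mod 2^16 */
    assert(q27 == ((1 << 27) + q / 2) / q);          /* closest integer to 2^27/q */
    assert(q18 == ((1 << 18) + q / 2) / q);          /* closest integer to 2^18/q */
    return 0;
}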
$(RM) $(LIB) diff --git a/crypto_kem/sntrup761/clean/Makefile.Microsoft_nmake b/crypto_kem/sntrup761/clean/Makefile.Microsoft_nmake new file mode 100644 index 00000000..4618d5a2 --- /dev/null +++ b/crypto_kem/sntrup761/clean/Makefile.Microsoft_nmake @@ -0,0 +1,19 @@ +# This Makefile can be used with Microsoft Visual Studio's nmake using the command: +# nmake /f Makefile.Microsoft_nmake + +LIBRARY=libsntrup761_clean.lib +OBJECTS=crypto_core_inv3sntrup761.obj crypto_core_invsntrup761.obj crypto_core_mult3sntrup761.obj crypto_core_multsntrup761.obj crypto_core_scale3sntrup761.obj crypto_core_weightsntrup761.obj crypto_core_wforcesntrup761.obj crypto_decode_761x1531.obj crypto_decode_761x3.obj crypto_decode_761x4591.obj crypto_decode_761xint16.obj crypto_decode_761xint32.obj crypto_encode_761x1531.obj crypto_encode_761x1531round.obj crypto_encode_761x3.obj crypto_encode_761x4591.obj crypto_encode_761xfreeze3.obj crypto_encode_761xint16.obj crypto_encode_int16.obj crypto_sort_int32.obj crypto_sort_uint32.obj crypto_stream_aes256ctr.obj crypto_verify_1039.obj kem.obj + +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX + +all: $(LIBRARY) + +# Make sure objects are recompiled if headers change. +$(OBJECTS): *.h + +$(LIBRARY): $(OBJECTS) + LIB.EXE /NOLOGO /WX /OUT:$@ $** + +clean: + -DEL $(OBJECTS) + -DEL $(LIBRARY) diff --git a/crypto_kem/sntrup761/clean/api.h b/crypto_kem/sntrup761/clean/api.h new file mode 100644 index 00000000..f0d53841 --- /dev/null +++ b/crypto_kem/sntrup761/clean/api.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_API_H +#define PQCLEAN_SNTRUP761_CLEAN_API_H + + + +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ALGNAME "sntrup761" + +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_SECRETKEYBYTES 1763 +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_PUBLICKEYBYTES 1158 +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CIPHERTEXTBYTES 1039 +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_BYTES 32 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_core_inv3sntrup761.c b/crypto_kem/sntrup761/clean/crypto_core_inv3sntrup761.c new file mode 100644 index 00000000..9b9af62a --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_inv3sntrup761.c @@ -0,0 +1,110 @@ +#include "crypto_core_inv3sntrup761.h" +#include "params.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* works for -16384 <= x < 16384 */ +static small F3_freeze(int16 x) { + return x - 3 * ((10923 * x + 16384) >> 15); +} + +/* byte p of output is 0 if recip succeeded; else -1 */ +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761(unsigned char *outbytes, const 
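int16_nonzero_mask and int16_negative_mask above produce 0 or -1 words that steer the branch-free swaps in the divstep loop that follows. A standalone sketch of the masks and of the XOR-based masked swap (nonzero_mask, negative_mask and the bodies below are illustrative copies, not the patch's symbols):

#include <assert.h>
#include <stdint.h>

/* -1 if x != 0, else 0 */
static int nonzero_mask(int16_t x) {
    uint32_t v = (uint16_t) x;
    v = -v;        /* 0 stays 0; anything else gets its top bit set */
    v >>= 31;
    return -(int) v;
}

/* -1 if x < 0, else 0 */
static int negative_mask(int16_t x) {
    uint16_t u = (uint16_t) x;
    u >>= 15;      /* sign bit of the 16-bit value */
    return -(int) u;
}

int main(void) {
    int16_t f = 7, g = -3;
    int swap, t;

    assert(nonzero_mask(0) == 0 && nonzero_mask(5) == -1 && nonzero_mask(-5) == -1);
    assert(negative_mask(-1) == -1 && negative_mask(0) == 0 && negative_mask(1) == 0);

    /* masked swap: swap == -1 exchanges f and g, swap == 0 leaves them alone,
       with no data-dependent branch either way */
    swap = -1;
    t = swap & (f ^ g);
    f ^= t;
    g ^= t;
    assert(f == -3 && g == 7);
    return 0;
}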
unsigned char *inbytes) { + small *out = (void *) outbytes; + small *in = (void *) inbytes; + small f[p + 1], g[p + 1], v[p + 1], r[p + 1]; + int i, loop, delta; + int sign, swap, t; + + for (i = 0; i < p + 1; ++i) { + v[i] = 0; + } + for (i = 0; i < p + 1; ++i) { + r[i] = 0; + } + r[0] = 1; + for (i = 0; i < p; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = f[p] = -1; + for (i = 0; i < p; ++i) { + small i1 = in[i] & 1; + g[p - 1 - i] = i1 - (in[i] & (i1 << 1)); + } + g[p] = 0; + + delta = 1; + + for (loop = 0; loop < 2 * p - 1; ++loop) { + for (i = p; i > 0; --i) { + v[i] = v[i - 1]; + } + v[0] = 0; + + sign = -g[0] * f[0]; + swap = int16_negative_mask(-delta) & int16_nonzero_mask(g[0]); + delta ^= swap & (delta ^ -delta); + delta += 1; + + for (i = 0; i < p + 1; ++i) { + t = swap & (f[i] ^ g[i]); + f[i] ^= t; + g[i] ^= t; + t = swap & (v[i] ^ r[i]); + v[i] ^= t; + r[i] ^= t; + } + + for (i = 0; i < p + 1; ++i) { + g[i] = F3_freeze(g[i] + sign * f[i]); + } + for (i = 0; i < p + 1; ++i) { + r[i] = F3_freeze(r[i] + sign * v[i]); + } + + for (i = 0; i < p; ++i) { + g[i] = g[i + 1]; + } + g[p] = 0; + } + + sign = f[0]; + for (i = 0; i < p; ++i) { + out[i] = sign * v[p - 1 - i]; + } + + out[p] = int16_nonzero_mask(delta); + return 0; +} diff --git a/crypto_kem/sntrup761/clean/crypto_core_inv3sntrup761.h b/crypto_kem/sntrup761/clean/crypto_core_inv3sntrup761.h new file mode 100644 index 00000000..583e49c0 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_inv3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_INV3SNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_INV3SNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761_OUTPUTBYTES 762 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_core_invsntrup761.c b/crypto_kem/sntrup761/clean/crypto_core_invsntrup761.c new file mode 100644 index 00000000..b496b030 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_invsntrup761.c @@ -0,0 +1,130 @@ +#include "crypto_core_invsntrup761.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod q */ + +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +static Fq Fq_recip(Fq a1) { + int i = 1; + Fq ai = a1; + + while (i < q - 2) { + ai = Fq_freeze(a1 * (int32)ai); + i += 1; + } + return ai; +} + +/* ----- polynomials mod q */ + +/* out = 1/(3*in) in Rq */ +/* 
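Fq_freeze above reduces via the precomputed 2^18 and 2^27 reciprocals, and Fq_recip inverts by raising to the q-2 power (Fermat). A standalone sketch that spot-checks both against a plain modular reference for q = 4591 (fq_freeze, fq_ref and fq_recip are illustrative copies, not the patch's symbols):

#include <assert.h>
#include <stdint.h>

#define Q 4591
#define Q18 57        /* round(2^18/q) */
#define Q27 29235     /* round(2^27/q) */

/* copy of the two-step reciprocal reduction; stated range -14000000 < x < 14000000 */
static int16_t fq_freeze(int32_t x) {
    x -= Q * ((Q18 * x) >> 18);
    x -= Q * ((Q27 * x + 67108864) >> 27);
    return (int16_t) x;
}

/* reference: centered representative of x mod q */
static int16_t fq_ref(int32_t x) {
    int32_t r = x % Q;
    if (r > (Q - 1) / 2) {
        r -= Q;
    }
    if (r < -(Q - 1) / 2) {
        r += Q;
    }
    return (int16_t) r;
}

/* a^(q-2) = 1/a mod q by Fermat, q prime; mirrors Fq_recip */
static int16_t fq_recip(int16_t a) {
    int16_t ai = a;
    int32_t i;
    for (i = 1; i < Q - 2; ++i) {
        ai = fq_freeze(a * (int32_t) ai);
    }
    return ai;
}

int main(void) {
    int32_t x;
    for (x = -13999999; x < 14000000; x += 997) {    /* sampled check of the stated range */
        assert(fq_freeze(x) == fq_ref(x));
    }
    assert(fq_ref(3 * (int32_t) fq_recip(3)) == 1);  /* 1/3 really is an inverse mod q */
    return 0;
}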
outbytes[2*p] is 0 if recip succeeded; else -1 */ +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + small *in = (void *) inbytes; + Fq out[p], f[p + 1], g[p + 1], v[p + 1], r[p + 1]; + int i, loop, delta; + int swap, t; + int32 f0, g0; + Fq scale; + + for (i = 0; i < p + 1; ++i) { + v[i] = 0; + } + for (i = 0; i < p + 1; ++i) { + r[i] = 0; + } + r[0] = Fq_recip(3); + for (i = 0; i < p; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = f[p] = -1; + for (i = 0; i < p; ++i) { + g[p - 1 - i] = in[i]; + } + g[p] = 0; + + delta = 1; + + for (loop = 0; loop < 2 * p - 1; ++loop) { + for (i = p; i > 0; --i) { + v[i] = v[i - 1]; + } + v[0] = 0; + + swap = int16_negative_mask(-delta) & int16_nonzero_mask(g[0]); + delta ^= swap & (delta ^ -delta); + delta += 1; + + for (i = 0; i < p + 1; ++i) { + t = swap & (f[i] ^ g[i]); + f[i] ^= t; + g[i] ^= t; + t = swap & (v[i] ^ r[i]); + v[i] ^= t; + r[i] ^= t; + } + + f0 = f[0]; + g0 = g[0]; + for (i = 0; i < p + 1; ++i) { + g[i] = Fq_freeze(f0 * g[i] - g0 * f[i]); + } + for (i = 0; i < p + 1; ++i) { + r[i] = Fq_freeze(f0 * r[i] - g0 * v[i]); + } + + for (i = 0; i < p; ++i) { + g[i] = g[i + 1]; + } + g[p] = 0; + } + + scale = Fq_recip(f[0]); + for (i = 0; i < p; ++i) { + out[i] = Fq_freeze(scale * (int32)v[p - 1 - i]); + } + + crypto_encode_pxint16(outbytes, out); + outbytes[2 * p] = int16_nonzero_mask(delta); + return 0; +} diff --git a/crypto_kem/sntrup761/clean/crypto_core_invsntrup761.h b/crypto_kem/sntrup761/clean/crypto_core_invsntrup761.h new file mode 100644 index 00000000..c9e09ef4 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_invsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_INVSNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_INVSNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761_OUTPUTBYTES 1523 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_core_mult3sntrup761.c b/crypto_kem/sntrup761/clean/crypto_core_mult3sntrup761.c new file mode 100644 index 00000000..7c32f29e --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_mult3sntrup761.c @@ -0,0 +1,57 @@ +#include "crypto_core_mult3sntrup761.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +typedef int8 small; + +/* works for -16384 <= x < 16384 */ +static small F3_freeze(int16 x) { + return x - 3 * ((10923 * x + 16384) >> 15); +} + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + small *h = (void *) outbytes; + small f[p]; + small g[p]; + small fg[p + p - 1]; + int16 result; + int i, j; + + for (i = 0; i < p; ++i) { + small fi = inbytes[i]; + small fi0 = fi & 1; + f[i] = fi0 - (fi & (fi0 << 1)); + } + for (i = 0; i < p; ++i) { + small gi = kbytes[i]; + small gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + for (i = 0; i < p; ++i) { + result = 0; + for (j = 0; j <= i; ++j) { + result += f[j] * g[i - j]; + } + fg[i] = F3_freeze(result); + } + for (i = p; i < p + p - 1; ++i) { + result = 0; + for (j = i - p + 1; j < p; ++j) { + result += f[j] * g[i - j]; + } + fg[i] = F3_freeze(result); + } + + for (i = p + p - 2; 
i >= p; --i) { + fg[i - p] = F3_freeze(fg[i - p] + fg[i]); + fg[i - p + 1] = F3_freeze(fg[i - p + 1] + fg[i]); + } + + for (i = 0; i < p; ++i) { + h[i] = fg[i]; + } + return 0; +} diff --git a/crypto_kem/sntrup761/clean/crypto_core_mult3sntrup761.h b/crypto_kem/sntrup761/clean/crypto_core_mult3sntrup761.h new file mode 100644 index 00000000..d06fb73c --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_mult3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_MULT3SNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_MULT3SNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761_OUTPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761_KEYBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_core_multsntrup761.c b/crypto_kem/sntrup761/clean/crypto_core_multsntrup761.c new file mode 100644 index 00000000..e283456e --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_multsntrup761.c @@ -0,0 +1,60 @@ +#include "crypto_core_multsntrup761.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + Fq f[p]; + small g[p]; + Fq fg[p + p - 1]; + int32 result; + int i, j; + + crypto_decode_pxint16(f, inbytes); + for (i = 0; i < p; ++i) { + f[i] = Fq_freeze(f[i]); + } + + for (i = 0; i < p; ++i) { + small gi = kbytes[i]; + small gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + for (i = 0; i < p; ++i) { + result = 0; + for (j = 0; j <= i; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + for (i = p; i < p + p - 1; ++i) { + result = 0; + for (j = i - p + 1; j < p; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + + for (i = p + p - 2; i >= p; --i) { + fg[i - p] = Fq_freeze(fg[i - p] + fg[i]); + fg[i - p + 1] = Fq_freeze(fg[i - p + 1] + fg[i]); + } + + crypto_encode_pxint16(outbytes, fg); + return 0; +} diff --git a/crypto_kem/sntrup761/clean/crypto_core_multsntrup761.h b/crypto_kem/sntrup761/clean/crypto_core_multsntrup761.h new file mode 100644 index 00000000..dc81268f --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_multsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_MULTSNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_MULTSNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761_OUTPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761_INPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761_KEYBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_core_scale3sntrup761.c 
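Both multiplication cores above reduce the 2p-1 coefficient schoolbook product using x^p = x + 1, folding coefficient i into positions i-p and i-p+1. A tiny worked example of that folding with P = 3 (illustrative only; the patch uses p = 761 and additionally freezes each sum mod 3 or mod q):

#include <assert.h>

#define P 3

int main(void) {
    /* f = x^2, g = x^2, so f*g = x^4 before reduction */
    int fg[2 * P - 1] = {0, 0, 0, 0, 1};
    int i;
    for (i = 2 * P - 2; i >= P; --i) {
        /* x^i = x^(i-P+1) + x^(i-P) in Z[x]/(x^P - x - 1) */
        fg[i - P] += fg[i];
        fg[i - P + 1] += fg[i];
    }
    /* x^4 = x * x^3 = x * (x + 1) = x^2 + x */
    assert(fg[0] == 0 && fg[1] == 1 && fg[2] == 1);
    return 0;
}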
b/crypto_kem/sntrup761/clean/crypto_core_scale3sntrup761.c new file mode 100644 index 00000000..f4da4e6f --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_scale3sntrup761.c @@ -0,0 +1,32 @@ +#include "crypto_core_scale3sntrup761.h" +#include "crypto_decode_761xint16.h" +#include "crypto_encode_761xint16.h" + + +#define p 761 +#define q 4591 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16 + +typedef int16_t Fq; + +/* out = 3*in in Rq */ +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + Fq f[p]; + int i; + + crypto_decode_pxint16(f, inbytes); + for (i = 0; i < p; ++i) { + Fq x = f[i]; + x *= 3; /* (-3q+3)/2 ... (3q-3)/2 */ + x -= (q + 1) / 2; /* -2q+1 ... q-2 */ + x += q & (x >> 15); /* -q+1 ... q-1 */ + x += q & (x >> 15); /* 0 ... q-1 */ + x -= (q - 1) / 2; /* -(q-1)/2 ... (q-1)/2 */ + f[i] = x; + } + crypto_encode_pxint16(outbytes, f); + + return 0; +} diff --git a/crypto_kem/sntrup761/clean/crypto_core_scale3sntrup761.h b/crypto_kem/sntrup761/clean/crypto_core_scale3sntrup761.h new file mode 100644 index 00000000..712cdb5b --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_scale3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_SCALE3SNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_SCALE3SNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761_OUTPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761_INPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_core_weightsntrup761.c b/crypto_kem/sntrup761/clean/crypto_core_weightsntrup761.c new file mode 100644 index 00000000..3809abf0 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_weightsntrup761.c @@ -0,0 +1,21 @@ +#include "crypto_core_weightsntrup761.h" +#include "crypto_encode_int16.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t + + +/* out = little-endian weight of bottom bits of in */ +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + int8 *in = (void *) inbytes; + int16 weight = 0; + int i; + + for (i = 0; i < p; ++i) { + weight += in[i] & 1; + } + PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16(outbytes, &weight); + return 0; +} diff --git a/crypto_kem/sntrup761/clean/crypto_core_weightsntrup761.h b/crypto_kem/sntrup761/clean/crypto_core_weightsntrup761.h new file mode 100644 index 00000000..a1cef62f --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_weightsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_WEIGHTSNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_WEIGHTSNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761_OUTPUTBYTES 2 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git 
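The scale3 core above recenters 3x into -(q-1)/2..(q-1)/2 with two masked additions of q. A standalone sketch that checks the branch-free steps against a plain reference over the whole input range (scale3 here is a copy for illustration, not the patch's function):

#include <assert.h>
#include <stdint.h>

#define Q 4591

/* for x in -(q-1)/2..(q-1)/2, returns the representative of 3x mod q in the same range */
static int16_t scale3(int16_t x) {
    x *= 3;                      /* -(3q-3)/2 ... (3q-3)/2 */
    x -= (Q + 1) / 2;            /* -2q+1 ... q-2 */
    x += Q & (x >> 15);          /* -q+1 ... q-1 */
    x += Q & (x >> 15);          /* 0 ... q-1 */
    x -= (Q - 1) / 2;            /* -(q-1)/2 ... (q-1)/2 */
    return x;
}

int main(void) {
    int32_t a;
    for (a = -(Q - 1) / 2; a <= (Q - 1) / 2; ++a) {
        int32_t want = (3 * a) % Q;            /* reference value of 3a mod q, recentered */
        if (want > (Q - 1) / 2) {
            want -= Q;
        }
        if (want < -(Q - 1) / 2) {
            want += Q;
        }
        assert(scale3((int16_t) a) == want);
    }
    return 0;
}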
a/crypto_kem/sntrup761/clean/crypto_core_wforcesntrup761.c b/crypto_kem/sntrup761/clean/crypto_core_wforcesntrup761.c new file mode 100644 index 00000000..806d3111 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_wforcesntrup761.c @@ -0,0 +1,48 @@ +#include "crypto_core_wforcesntrup761.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +typedef int8 small; + + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* 0 if Weightw_is(r), else -1 */ +static int Weightw_mask(const small *r) { + int weight = 0; + int i; + + for (i = 0; i < p; ++i) { + weight += r[i] & 1; + } + return int16_nonzero_mask(weight - w); +} + +/* out = in if bottom bits of in have weight w */ +/* otherwise out = (1,1,...,1,0,0,...,0) */ +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + small *out = (void *) outbytes; + const small *in = (const void *) inbytes; + int i, mask; + + mask = Weightw_mask(in); /* 0 if weight w, else -1 */ + for (i = 0; i < w; ++i) { + out[i] = ((in[i] ^ 1) & ~mask) ^ 1; + } + for (i = w; i < p; ++i) { + out[i] = in[i] & ~mask; + } + return 0; +} diff --git a/crypto_kem/sntrup761/clean/crypto_core_wforcesntrup761.h b/crypto_kem/sntrup761/clean/crypto_core_wforcesntrup761.h new file mode 100644 index 00000000..42e84231 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_core_wforcesntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_WFORCESNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_WFORCESNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761_OUTPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_decode_761x1531.c b/crypto_kem/sntrup761/clean/crypto_decode_761x1531.c new file mode 100644 index 00000000..d19d7402 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_decode_761x1531.c @@ -0,0 +1,211 @@ +#include "crypto_decode_761x1531.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* +CPU division instruction typically takes time depending on x. +This software is designed to take time independent of x. +Time still varies depending on m; user must ensure that m is constant. +Time also varies on CPUs where multiplication is variable-time. +There could be more CPU issues. +There could also be compiler issues. 
+*/ + +static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { + uint32 v = 0x80000000; + uint32 qpart; + uint32 mask; + + v /= m; + + /* caller guarantees m > 0 */ + /* caller guarantees m < 16384 */ + /* vm <= 2^31 <= vm+m-1 */ + /* xvm <= 2^31 x <= xvm+x(m-1) */ + + *q = 0; + + qpart = (x * (uint64)v) >> 31; + /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ + /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ + /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ + /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ + + x -= qpart * m; + *q += qpart; + /* x <= 49146 */ + + qpart = (x * (uint64)v) >> 31; + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ + /* 0 <= newx <= m + 0.4 */ + /* 0 <= newx <= m */ + + x -= qpart * m; + *q += qpart; + /* x <= m */ + + x -= m; + *q += 1; + mask = -(x >> 31); + x += mask & (uint32)m; + *q += mask; + /* x < m */ + + *r = x; +} + +static uint16 uint32_mod_uint14(uint32 x, uint16 m) { + uint32 q; + uint16 r; + uint32_divmod_uint14(&q, &r, x, m); + return r; +} + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531(void *v, const unsigned char *s) { + int16 *R0 = v; + uint16 R1[381], R2[191], R3[96], R4[48], R5[24], R6[12], R7[6], R8[3], R9[2], R10[1]; + long long i; + uint16 r0; + uint32 r1, r2; + + s += PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531_STRBYTES; + r1 = 0; + r1 = (r1 << 8) | *--s; + r1 = (r1 << 8) | *--s; + r1 = uint32_mod_uint14(r1, 3475); /* needed only for invalid inputs */ + R10[0] = r1; + + r2 = R10[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 593); + R9[0] = r0; + r1 = uint32_mod_uint14(r1, 1500); /* needed only for invalid inputs */ + R9[1] = r1; + + R8[2] = R9[1]; + r2 = R9[0]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6232); + R8[0] = r0; + r1 = uint32_mod_uint14(r1, 6232); /* needed only for invalid inputs */ + R8[1] = r1; + + r2 = R8[2]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1263); + R7[4] = r0; + r1 = uint32_mod_uint14(r1, 304); /* needed only for invalid inputs */ + R7[5] = r1; + for (i = 1; i >= 0; --i) { + r2 = R8[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1263); + R7[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1263); /* needed only for invalid inputs */ + R7[2 * i + 1] = r1; + } + + r2 = R7[5]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9097); + R6[10] = r0; + r1 = uint32_mod_uint14(r1, 2188); /* needed only for invalid inputs */ + R6[11] = r1; + for (i = 4; i >= 0; --i) { + r2 = R7[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9097); + R6[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 9097); /* needed only for invalid inputs */ + R6[2 * i + 1] = r1; + } + + r2 = R6[11]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1526); + R5[22] = r0; + r1 = uint32_mod_uint14(r1, 367); /* needed only for invalid inputs */ + R5[23] = r1; + for (i = 10; i >= 0; --i) { + r2 = R6[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1526); + R5[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1526); /* needed only for invalid inputs */ + R5[2 * i + 1] = r1; + } + + r2 = R5[23]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 625); + R4[46] = r0; + r1 = uint32_mod_uint14(r1, 150); /* needed only for invalid inputs */ + R4[47] = r1; + for 
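uint32_divmod_uint14 above replaces the CPU's variable-time division with two reciprocal multiplications and one masked correction. A standalone spot-check of quotient and remainder against / and % over a grid of inputs (divmod_uint14 is an illustrative copy; this checks outputs only and does not re-derive the range comments):

#include <assert.h>
#include <stdint.h>

/* copy of the reciprocal-based division: v = floor(2^31/m), two
   multiply-and-subtract rounds, then one masked correction */
static void divmod_uint14(uint32_t *q, uint16_t *r, uint32_t x, uint16_t m) {
    uint32_t v = 0x80000000u / m;
    uint32_t qpart, mask;

    *q = 0;

    qpart = (uint32_t) (((uint64_t) x * v) >> 31);
    x -= qpart * m;
    *q += qpart;

    qpart = (uint32_t) (((uint64_t) x * v) >> 31);
    x -= qpart * m;
    *q += qpart;

    x -= m;
    *q += 1;
    mask = -(x >> 31);
    x += mask & (uint32_t) m;
    *q += mask;

    *r = (uint16_t) x;
}

int main(void) {
    uint64_t x;
    uint16_t m;
    for (m = 1; m < 16384; m += 911) {               /* caller guarantees 0 < m < 16384 */
        for (x = 0; x <= 0xffffffffu; x += 65537) {  /* coarse sweep of the uint32 range */
            uint32_t q;
            uint16_t r;
            divmod_uint14(&q, &r, (uint32_t) x, m);
            assert(q == (uint32_t) x / m);
            assert(r == (uint32_t) x % m);
        }
    }
    return 0;
}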
(i = 22; i >= 0; --i) { + r2 = R5[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 625); + R4[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 625); /* needed only for invalid inputs */ + R4[2 * i + 1] = r1; + } + + r2 = R4[47]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6400); + R3[94] = r0; + r1 = uint32_mod_uint14(r1, 1531); /* needed only for invalid inputs */ + R3[95] = r1; + for (i = 46; i >= 0; --i) { + r2 = R4[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6400); + R3[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 6400); /* needed only for invalid inputs */ + R3[2 * i + 1] = r1; + } + + R2[190] = R3[95]; + for (i = 94; i >= 0; --i) { + r2 = R3[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1280); + R2[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1280); /* needed only for invalid inputs */ + R2[2 * i + 1] = r1; + } + + R1[380] = R2[190]; + for (i = 189; i >= 0; --i) { + r2 = R2[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9157); + R1[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 9157); /* needed only for invalid inputs */ + R1[2 * i + 1] = r1; + } + + R0[760] = 3 * R1[380] - 2295; + for (i = 379; i >= 0; --i) { + r2 = R1[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1531); + R0[2 * i] = 3 * r0 - 2295; + r1 = uint32_mod_uint14(r1, 1531); /* needed only for invalid inputs */ + R0[2 * i + 1] = 3 * r1 - 2295; + } +} diff --git a/crypto_kem/sntrup761/clean/crypto_decode_761x1531.h b/crypto_kem/sntrup761/clean/crypto_decode_761x1531.h new file mode 100644 index 00000000..c55247c9 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_decode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X1531_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X1531_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_decode_761x3.c b/crypto_kem/sntrup761/clean/crypto_decode_761x3.c new file mode 100644 index 00000000..394b0ccb --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_decode_761x3.c @@ -0,0 +1,24 @@ +#include "crypto_decode_761x3.h" + +#define uint8 uint8_t + +#define p 761 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3(void *v, const unsigned char *s) { + uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + } + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; +} diff --git a/crypto_kem/sntrup761/clean/crypto_decode_761x3.h b/crypto_kem/sntrup761/clean/crypto_decode_761x3.h new file mode 100644 index 00000000..acf9d9cc --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_decode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X3_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X3_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3_STRBYTES 191 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3(void *v, const unsigned char *s); +#endif diff 
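crypto_decode_761x3 above unpacks four coefficients in {-1,0,1} from each byte, each stored as value+1 in two bits (761 = 4*190 + 1, so the final byte carries a single coefficient); the matching encoder appears later in the patch. A one-byte round-trip sketch (standalone, not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void) {
    int8_t f[4] = {-1, 0, 1, -1};
    uint8_t s;
    int8_t g[4];
    int i;

    /* encode: each coefficient+1 occupies two bits */
    s = (uint8_t) ((f[0] + 1) | ((f[1] + 1) << 2) | ((f[2] + 1) << 4) | ((f[3] + 1) << 6));

    /* decode: extract two bits and subtract 1 */
    for (i = 0; i < 4; ++i) {
        g[i] = (int8_t) (((s >> (2 * i)) & 3) - 1);
    }
    for (i = 0; i < 4; ++i) {
        assert(g[i] == f[i]);
    }
    return 0;
}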
--git a/crypto_kem/sntrup761/clean/crypto_decode_761x4591.c b/crypto_kem/sntrup761/clean/crypto_decode_761x4591.c new file mode 100644 index 00000000..630c11fe --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_decode_761x4591.c @@ -0,0 +1,211 @@ +#include "crypto_decode_761x4591.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* +CPU division instruction typically takes time depending on x. +This software is designed to take time independent of x. +Time still varies depending on m; user must ensure that m is constant. +Time also varies on CPUs where multiplication is variable-time. +There could be more CPU issues. +There could also be compiler issues. +*/ + +static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { + uint32 v = 0x80000000; + uint32 qpart; + uint32 mask; + + v /= m; + + /* caller guarantees m > 0 */ + /* caller guarantees m < 16384 */ + /* vm <= 2^31 <= vm+m-1 */ + /* xvm <= 2^31 x <= xvm+x(m-1) */ + + *q = 0; + + qpart = (x * (uint64)v) >> 31; + /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ + /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ + /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ + /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ + + x -= qpart * m; + *q += qpart; + /* x <= 49146 */ + + qpart = (x * (uint64)v) >> 31; + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ + /* 0 <= newx <= m + 0.4 */ + /* 0 <= newx <= m */ + + x -= qpart * m; + *q += qpart; + /* x <= m */ + + x -= m; + *q += 1; + mask = -(x >> 31); + x += mask & (uint32)m; + *q += mask; + /* x < m */ + + *r = x; +} + +static uint16 uint32_mod_uint14(uint32 x, uint16 m) { + uint32 q; + uint16 r; + uint32_divmod_uint14(&q, &r, x, m); + return r; +} + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591(void *v, const unsigned char *s) { + int16 *R0 = v; + uint16 R1[381], R2[191], R3[96], R4[48], R5[24], R6[12], R7[6], R8[3], R9[2], R10[1]; + long long i; + uint16 r0; + uint32 r1, r2; + + s += PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591_STRBYTES; + r1 = 0; + r1 = (r1 << 8) | *--s; + r1 = (r1 << 8) | *--s; + r1 = uint32_mod_uint14(r1, 1608); /* needed only for invalid inputs */ + R10[0] = r1; + + r2 = R10[0]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9470); + R9[0] = r0; + r1 = uint32_mod_uint14(r1, 11127); /* needed only for invalid inputs */ + R9[1] = r1; + + R8[2] = R9[1]; + r2 = R9[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1557); + R8[0] = r0; + r1 = uint32_mod_uint14(r1, 1557); /* needed only for invalid inputs */ + R8[1] = r1; + + r2 = R8[2]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 10101); + R7[4] = r0; + r1 = uint32_mod_uint14(r1, 282); /* needed only for invalid inputs */ + R7[5] = r1; + for (i = 1; i >= 0; --i) { + r2 = R8[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 10101); + R7[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 10101); /* needed only for invalid inputs */ + R7[2 * i + 1] = r1; + } + + r2 = R7[5]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1608); + R6[10] = r0; + r1 = uint32_mod_uint14(r1, 11468); /* needed only for invalid inputs */ + R6[11] = r1; + for (i = 4; i >= 0; --i) { + r2 = R7[i]; + r2 = (r2 << 8) | *--s; + 
uint32_divmod_uint14(&r1, &r0, r2, 1608); + R6[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1608); /* needed only for invalid inputs */ + R6[2 * i + 1] = r1; + } + + r2 = R6[11]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 10265); + R5[22] = r0; + r1 = uint32_mod_uint14(r1, 286); /* needed only for invalid inputs */ + R5[23] = r1; + for (i = 10; i >= 0; --i) { + r2 = R6[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 10265); + R5[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 10265); /* needed only for invalid inputs */ + R5[2 * i + 1] = r1; + } + + r2 = R5[23]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1621); + R4[46] = r0; + r1 = uint32_mod_uint14(r1, 11550); /* needed only for invalid inputs */ + R4[47] = r1; + for (i = 22; i >= 0; --i) { + r2 = R5[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1621); + R4[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1621); /* needed only for invalid inputs */ + R4[2 * i + 1] = r1; + } + + r2 = R4[47]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 644); + R3[94] = r0; + r1 = uint32_mod_uint14(r1, 4591); /* needed only for invalid inputs */ + R3[95] = r1; + for (i = 46; i >= 0; --i) { + r2 = R4[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 644); + R3[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 644); /* needed only for invalid inputs */ + R3[2 * i + 1] = r1; + } + + R2[190] = R3[95]; + for (i = 94; i >= 0; --i) { + r2 = R3[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 406); + R2[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 406); /* needed only for invalid inputs */ + R2[2 * i + 1] = r1; + } + + R1[380] = R2[190]; + for (i = 189; i >= 0; --i) { + r2 = R2[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 322); + R1[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 322); /* needed only for invalid inputs */ + R1[2 * i + 1] = r1; + } + + R0[760] = R1[380] - 2295; + for (i = 379; i >= 0; --i) { + r2 = R1[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 4591); + R0[2 * i] = r0 - 2295; + r1 = uint32_mod_uint14(r1, 4591); /* needed only for invalid inputs */ + R0[2 * i + 1] = r1 - 2295; + } +} diff --git a/crypto_kem/sntrup761/clean/crypto_decode_761x4591.h b/crypto_kem/sntrup761/clean/crypto_decode_761x4591.h new file mode 100644 index 00000000..49b80ca6 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_decode_761x4591.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X4591_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X4591_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591_STRBYTES 1158 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_decode_761xint16.c b/crypto_kem/sntrup761/clean/crypto_decode_761xint16.c new file mode 100644 index 00000000..7678d872 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_decode_761xint16.c @@ -0,0 +1,16 @@ +#include "crypto_decode_761xint16.h" + + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git 
a/crypto_kem/sntrup761/clean/crypto_decode_761xint16.h b/crypto_kem/sntrup761/clean/crypto_decode_761xint16.h new file mode 100644 index 00000000..867054c9 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_decode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761XINT16_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761XINT16_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16_STRBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16_ITEMS 761 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_decode_761xint32.c b/crypto_kem/sntrup761/clean/crypto_decode_761xint32.c new file mode 100644 index 00000000..71d4dcc4 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_decode_761xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_761xint32.h" + + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/crypto_kem/sntrup761/clean/crypto_decode_761xint32.h b/crypto_kem/sntrup761/clean/crypto_decode_761xint32.h new file mode 100644 index 00000000..7771a1e8 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_decode_761xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761XINT32_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761XINT32_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32_STRBYTES 3044 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32_ITEMBYTES 4 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32_ITEMS 761 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_encode_761x1531.c b/crypto_kem/sntrup761/clean/crypto_encode_761x1531.c new file mode 100644 index 00000000..17db0eef --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761x1531.c @@ -0,0 +1,119 @@ +#include "crypto_encode_761x1531.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + uint16 r0, r1; + uint32 r2; + + for (i = 0; i < 380; ++i) { + r0 = (((R0[2 * i] + 2295) & 16383) * 10923) >> 15; + r1 = (((R0[2 * i + 1] + 2295) & 16383) * 10923) >> 15; + r2 = r0 + r1 * (uint32)1531; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[380] = (((R0[760] + 2295) & 16383) * 10923) >> 15; + + for (i = 0; i < 190; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9157; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[190] = R[380]; + + for (i = 0; i < 95; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1280; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[95] = R[190]; + + for (i = 0; i < 48; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)6400; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 24; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)625; + *out++ = r2; + r2 >>= 8; + R[i] = 
r2; + } + + for (i = 0; i < 12; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1526; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 6; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9097; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1263; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)6232; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)593; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup761/clean/crypto_encode_761x1531.h b/crypto_kem/sntrup761/clean/crypto_encode_761x1531.h new file mode 100644 index 00000000..cc4f332c --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X1531_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X1531_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_encode_761x1531round.c b/crypto_kem/sntrup761/clean/crypto_encode_761x1531round.c new file mode 100644 index 00000000..6b10ccb4 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761x1531round.c @@ -0,0 +1,17 @@ +#include "crypto_encode_761x1531.h" +#include "crypto_encode_761x1531round.h" + +#define int16 int16_t + +#define p 761 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round(unsigned char *out, const void *v) { + const int16 *a = v; + int16 x[p]; + int i; + + for (i = 0; i < p; ++i) { + x[i] = 3 * ((10923 * a[i] + 16384) >> 15); + } + PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531(out, x); +} diff --git a/crypto_kem/sntrup761/clean/crypto_encode_761x1531round.h b/crypto_kem/sntrup761/clean/crypto_encode_761x1531round.h new file mode 100644 index 00000000..64730f1f --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761x1531round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X1531ROUND_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X1531ROUND_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_encode_761x3.c b/crypto_kem/sntrup761/clean/crypto_encode_761x3.c new file mode 100644 index 00000000..54deace0 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761x3.c @@ -0,0 +1,21 @@ +#include "crypto_encode_761x3.h" + +#define uint8 uint8_t + +#define p 761 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3(unsigned char *s, const void *v) { + const uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *f++ + 1; + x += (*f++ + 1) << 2; + x += (*f++ + 1) << 4; + x += (*f++ + 1) << 6; + *s++ = x; + } + x = *f++ + 1; + *s++ = x; +} diff --git 
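crypto_encode_761x1531round above rounds each coefficient to the nearest multiple of 3 via 3*((10923*a + 16384) >> 15): 10923/2^15 approximates 1/3 and the added 16384 makes the shift round to nearest. A standalone check over the full coefficient range (it relies on arithmetic right shift of negative values, as the patch itself does):

#include <assert.h>
#include <stdint.h>

int main(void) {
    int32_t a;
    for (a = -2295; a <= 2295; ++a) {
        int32_t got = 3 * ((10923 * a + 16384) >> 15);
        int32_t m = a % 3;
        int32_t want;
        if (m < 0) {
            m += 3;
        }
        want = (m == 2) ? a + 1 : a - m;   /* nearest multiple of 3 (no ties for integers) */
        assert(got == want);
    }
    return 0;
}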
a/crypto_kem/sntrup761/clean/crypto_encode_761x3.h b/crypto_kem/sntrup761/clean/crypto_encode_761x3.h new file mode 100644 index 00000000..e5ab1b17 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X3_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X3_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3_STRBYTES 191 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_encode_761x4591.c b/crypto_kem/sntrup761/clean/crypto_encode_761x4591.c new file mode 100644 index 00000000..c09514c2 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761x4591.c @@ -0,0 +1,147 @@ +#include "crypto_encode_761x4591.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + uint16 r0, r1; + uint32 r2; + + for (i = 0; i < 380; ++i) { + r0 = (R0[2 * i] + 2295) & 16383; + r1 = (R0[2 * i + 1] + 2295) & 16383; + r2 = r0 + r1 * (uint32)4591; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[380] = (R0[760] + 2295) & 16383; + + for (i = 0; i < 190; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)322; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[190] = R[380]; + + for (i = 0; i < 95; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)406; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[95] = R[190]; + + for (i = 0; i < 48; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)644; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 23; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1621; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[46]; + r1 = R[47]; + r2 = r0 + r1 * (uint32)1621; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[23] = r2; + + for (i = 0; i < 11; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)10265; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[22]; + r1 = R[23]; + r2 = r0 + r1 * (uint32)10265; + *out++ = r2; + r2 >>= 8; + R[11] = r2; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1608; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[10]; + r1 = R[11]; + r2 = r0 + r1 * (uint32)1608; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[5] = r2; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)10101; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[4]; + r1 = R[5]; + r2 = r0 + r1 * (uint32)10101; + *out++ = r2; + r2 >>= 8; + R[2] = r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)1557; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)9470; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup761/clean/crypto_encode_761x4591.h b/crypto_kem/sntrup761/clean/crypto_encode_761x4591.h new file mode 100644 
index 00000000..b1d5f998 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761x4591.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X4591_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X4591_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591_STRBYTES 1158 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_encode_761xfreeze3.c b/crypto_kem/sntrup761/clean/crypto_encode_761xfreeze3.c new file mode 100644 index 00000000..d2b10e3e --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761xfreeze3.c @@ -0,0 +1,25 @@ +#include "crypto_encode_761xfreeze3.h" + +#define int16 int16_t + +#define p 761 + +/* valid inputs: -16384 <= x < 16384 */ +/* then 3 divides x-F3_freeze(x) */ +/* and F3_freeze(x) is in {-1,0,1} */ + +/* all inputs: 3 divides x-F3_freeze(x) */ +/* and F3_freeze(x) is in {-2,-1,0,1,2} */ + +static inline unsigned char F3_freeze(int16 x) { + return x - 3 * ((10923 * x + 16384) >> 15); +} + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3(unsigned char *s, const void *v) { + const int16 *r = v; + + int i; + for (i = 0; i < p; ++i) { + s[i] = F3_freeze(r[i]); + } +} diff --git a/crypto_kem/sntrup761/clean/crypto_encode_761xfreeze3.h b/crypto_kem/sntrup761/clean/crypto_encode_761xfreeze3.h new file mode 100644 index 00000000..14517f13 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761xfreeze3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761XFREEZE3_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761XFREEZE3_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3_STRBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_encode_761xint16.c b/crypto_kem/sntrup761/clean/crypto_encode_761xint16.c new file mode 100644 index 00000000..02384178 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_761xint16.h" + + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/sntrup761/clean/crypto_encode_761xint16.h b/crypto_kem/sntrup761/clean/crypto_encode_761xint16.h new file mode 100644 index 00000000..081fe9cc --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761XINT16_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761XINT16_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16_STRBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16_ITEMS 761 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_encode_int16.c b/crypto_kem/sntrup761/clean/crypto_encode_int16.c new file mode 100644 index 00000000..074814aa --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_int16.c @@ -0,0 +1,9 @@ +#include 
"crypto_encode_int16.h" + +#define uint16 uint16_t + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16(unsigned char *s, const void *x) { + uint16 u = *(const uint16 *) x; + s[0] = u; + s[1] = u >> 8; +} diff --git a/crypto_kem/sntrup761/clean/crypto_encode_int16.h b/crypto_kem/sntrup761/clean/crypto_encode_int16.h new file mode 100644 index 00000000..1b96805e --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_encode_int16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_INT16_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_INT16_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16_STRBYTES 2 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16_ITEMS 1 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16(unsigned char *s, const void *x); +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_sort_int32.c b/crypto_kem/sntrup761/clean/crypto_sort_int32.c new file mode 100644 index 00000000..1d2ca492 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_sort_int32.c @@ -0,0 +1,86 @@ +#include "crypto_sort_int32.h" +#include +// Based on supercop-20190110/crypto_sort/int32/x86 + + +#define int32 int32_t + +#define int32_MINMAX(a,b) \ + do { \ + int32_t ab = (b) ^ (a); \ + int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \ + c ^= ab & (c ^ (b)); \ + c >>= 31; \ + c &= ab; \ + (a) ^= c; \ + (b) ^= c; \ + } while(0) + +/* assume 2 <= n <= 0x40000000 */ +void PQCLEAN_SNTRUP761_CLEAN_crypto_sort_int32(int32 *array, size_t n) { + size_t top, p, q, r, i, j; + int32 *x = array; + + top = 1; + while (top < n - top) { + top += top; + } + + for (p = top; p >= 1; p >>= 1) { + i = 0; + while (i + 2 * p <= n) { + for (j = i; j < i + p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + i += 2 * p; + } + for (j = i; j < n - p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + + i = 0; + j = 0; + for (q = top; q > p; q >>= 1) { + if (j != i) { + for (;;) { + if (j == n - q) { + goto done; + } + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + if (j == i + p) { + i += 2 * p; + break; + } + } + } + while (i + p <= n - q) { + for (j = i; j < i + p; ++j) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + } + i += 2 * p; + } + /* now i + p > n - q */ + j = i; + while (j < n - q) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + } + +done: + ; + } + } +} diff --git a/crypto_kem/sntrup761/clean/crypto_sort_int32.h b/crypto_kem/sntrup761/clean/crypto_sort_int32.h new file mode 100644 index 00000000..33214f4f --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_SORT_INT32_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_SORT_INT32_H + +#include +#include + + +void PQCLEAN_SNTRUP761_CLEAN_crypto_sort_int32(int32_t *array, size_t n); + +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_sort_uint32.c b/crypto_kem/sntrup761/clean/crypto_sort_uint32.c new file mode 100644 index 00000000..a2b8667d --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_SNTRUP761_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + 
size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_SNTRUP761_CLEAN_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/sntrup761/clean/crypto_sort_uint32.h b/crypto_kem/sntrup761/clean/crypto_sort_uint32.h new file mode 100644 index 00000000..c24792fe --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_sort_uint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_SORT_UINT32_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_SORT_UINT32_H + +#include +#include + + +void PQCLEAN_SNTRUP761_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n); + +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_stream_aes256ctr.c b/crypto_kem/sntrup761/clean/crypto_stream_aes256ctr.c new file mode 100644 index 00000000..bfa46323 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_stream_aes256ctr.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" + + +int PQCLEAN_SNTRUP761_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); + return 0; +} diff --git a/crypto_kem/sntrup761/clean/crypto_stream_aes256ctr.h b/crypto_kem/sntrup761/clean/crypto_stream_aes256ctr.h new file mode 100644 index 00000000..df9dfe9b --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_stream_aes256ctr.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_STREAM_AES256CTR_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_STREAM_AES256CTR_H +#include "aes.h" +#include +#include + + + +int PQCLEAN_SNTRUP761_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]); + +#endif diff --git a/crypto_kem/sntrup761/clean/crypto_verify_1039.c b/crypto_kem/sntrup761/clean/crypto_verify_1039.c new file mode 100644 index 00000000..9307fa45 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_verify_1039.c @@ -0,0 +1,13 @@ +#include "crypto_verify_1039.h" + + +int PQCLEAN_SNTRUP761_CLEAN_crypto_verify_1039(const unsigned char *x, const unsigned char *y) { + unsigned int differentbits = 0; + int i; + + for (i = 0; i < PQCLEAN_SNTRUP761_CLEAN_crypto_verify_1039_BYTES; ++i) { + differentbits |= x[i] ^ y[i]; + } + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/crypto_kem/sntrup761/clean/crypto_verify_1039.h b/crypto_kem/sntrup761/clean/crypto_verify_1039.h new file mode 100644 index 00000000..f3ad9ba6 --- /dev/null +++ b/crypto_kem/sntrup761/clean/crypto_verify_1039.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_VERIFY_1039_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_VERIFY_1039_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_verify_1039_BYTES 1039 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_verify_1039(const unsigned char *x, const unsigned char *y); +#endif diff --git a/crypto_kem/sntrup761/clean/kem.c b/crypto_kem/sntrup761/clean/kem.c new file mode 100644 index 00000000..7e545aa7 --- /dev/null +++ b/crypto_kem/sntrup761/clean/kem.c @@ -0,0 +1,247 @@ +#include "api.h" +#include "crypto_sort_uint32.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + 
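+/* Here q = 4591 (see params.h), so Fq elements are kept centered in
+   -2295...2295.  The byte serializers rely on this range: the encoder in
+   crypto_encode_761x4591.c shifts each coefficient by +2295 into 0...4590
+   before radix packing, and crypto_encode_761x1531.c, which packs rounded
+   polynomials (all coefficients are multiples of 3), additionally divides
+   by 3 via (x * 10923) >> 15; e.g. a rounded coefficient -3 is stored as
+   (-3 + 2295) / 3 = 764. */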
+typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* ----- small polynomials */ + +/* R3_fromR(R_fromRq(r)) */ +static void R3_fromRq(small *out, const Fq *r) { + crypto_encode_pxfreeze3((unsigned char *) out, (unsigned char *) r); +} + +/* h = f*g in the ring R3 */ +static void R3_mult(small *h, const small *f, const small *g) { + crypto_core_mult3((unsigned char *) h, (const unsigned char *) f, (const unsigned char *) g); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* h = 3f in Rq */ +static void Rq_mult3(Fq *h, const Fq *f) { + crypto_encode_pxint16((unsigned char *) h, f); + crypto_core_scale3((unsigned char *) h, (const unsigned char *) h); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* out = 1/(3*in) in Rq */ +/* caller must have 2p+1 bytes free in out, not just 2p */ +static void Rq_recip3(Fq *out, const small *in) { + crypto_core_inv((unsigned char *) out, (const unsigned char *) in); + /* could check byte 2*p for failure; but, in context, inv always works */ + crypto_decode_pxint16(out, (unsigned char *) out); +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[ppadsort]; + int i; + + randombytes((unsigned char *) L, 4 * p); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < w; ++i) { + L[i] = L[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (L[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_SNTRUP761_CLEAN_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +static void Small_random(small *out) { + uint32 L[p]; + int i; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + out[i] = (((L[i] & 0x3fffffff) * 3) >> 30) - 1; + } +} + +/* ----- Streamlined NTRU Prime */ + +typedef small Inputs[p]; /* passed by reference */ +#define Ciphertexts_bytes Rounded_bytes +#define SecretKeys_bytes (2*Small_bytes) +#define PublicKeys_bytes Rq_bytes +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +/* also set r_enc[0]=3 */ +/* also set x[0]=2, and x[1:1+Hash_bytes] = Hash3(r_enc) */ +/* also overwrite x[1+Hash_bytes:1+2*Hash_bytes] */ +static void Hide(unsigned char *x, unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + Fq h[p]; + int i; + + Small_encode(r_enc + 1, r); + Rq_decode(h, pk); + Rq_mult_small(h, r); + Round_and_encode(c, h); + r_enc[0] = 3; + Hash(x + 1, r_enc, 1 + Small_bytes); + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Hash_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c + Ciphertexts_bytes, x, 1 + Hash_bytes * 2); +} + + +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + small g[p]; + for (;;) { + Small_random(g); + { + small v[p + 1]; + crypto_core_inv3((unsigned char *) v, (const unsigned char *) g); + if (v[p] == 0) { + Small_encode(sk + 
Small_bytes, v); + break; + } + } + } + { + small f[p]; + Short_random(f); + Small_encode(sk, f); + { + Fq h[p + 1]; + Rq_recip3(h, f); /* always works */ + Rq_mult_small(h, g); + Rq_encode(pk, h); + } + } + { + int i; + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Small_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Small_bytes); + } + return 0; +} + +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + unsigned char cache[Hash_bytes]; + int i; + { + unsigned char y[1 + PublicKeys_bytes]; /* XXX: can eliminate with incremental hashing */ + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + { + Inputs r; + Short_random(r); + { + unsigned char r_enc[Small_bytes + 1]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(x, c, r_enc, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + } + return 0; +} + +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Small_bytes; + int mask, i; + Inputs r; + { + Fq d[p]; + Rounded_decode(d, c); + { + small f[p]; + Small_decode(f, sk); + Rq_mult_small(d, f); + Rq_mult3(d, d); + } + { + small e[p]; + small v[p]; + R3_fromRq(e, d); + Small_decode(v, sk + Small_bytes); + R3_mult(r, e, v); + } + crypto_core_wforce((unsigned char *) r, (unsigned char *) r); + } + { + unsigned char r_enc[1 + Small_bytes]; + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + /* XXX: can use incremental hashing to reduce x size */ + + Hide(x, cnew, r_enc, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Small_bytes; ++i) { + r_enc[i + 1] ^= mask & (r_enc[i + 1] ^ rho[i]); + } + Hash(x + 1, r_enc, 1 + Small_bytes); /* XXX: can instead do cmov on cached hash of rho */ + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/sntrup761/clean/params.h b/crypto_kem/sntrup761/clean/params.h new file mode 100644 index 00000000..f64a7454 --- /dev/null +++ b/crypto_kem/sntrup761/clean/params.h @@ -0,0 +1,68 @@ +#ifndef params_H +#define params_H +#include "crypto_core_inv3sntrup761.h" +#include "crypto_core_invsntrup761.h" +#include "crypto_core_mult3sntrup761.h" +#include "crypto_core_multsntrup761.h" +#include "crypto_core_scale3sntrup761.h" +#include "crypto_core_weightsntrup761.h" +#include "crypto_core_wforcesntrup761.h" +#include "crypto_decode_761x1531.h" +#include "crypto_decode_761x3.h" +#include "crypto_decode_761x4591.h" +#include "crypto_decode_761xint16.h" +#include "crypto_decode_761xint32.h" +#include "crypto_encode_761x1531.h" +#include "crypto_encode_761x1531round.h" +#include "crypto_encode_761x3.h" +#include "crypto_encode_761x4591.h" +#include "crypto_encode_761xfreeze3.h" +#include "crypto_encode_761xint16.h" +#include "crypto_encode_int16.h" 
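+/* The macro definitions below map the generic names used by kem.c
+   (Rq_encode, Small_decode, crypto_core_mult, ...) onto the sntrup761
+   routines declared by these includes, so kem.c itself stays independent
+   of the parameter set. */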
+#include "crypto_verify_1039.h" + + +#define p 761 +#define q27 29235 /* closest integer to 2^27/q */ +#define q18 57 /* closest integer to 2^18/q */ +#define q 4591 +#define w 286 + +#define ppadsort 768 + +#define crypto_verify_clen PQCLEAN_SNTRUP761_CLEAN_crypto_verify_1039 + +#define Rq_bytes PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591_STRBYTES +#define Rq_encode PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591 +#define Rq_decode PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591 + +#define Rounded_bytes PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531_STRBYTES +#define Rounded_decode PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531 + +#define Round_and_encode PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round + +#define Small_bytes PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3_STRBYTES +#define Small_encode PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3 +#define Small_decode PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3 + +#define crypto_encode_pxfreeze3 PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3 + +#define crypto_decode_pxint32 PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16 + +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16 + +#define crypto_core_wforce PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761 + +#define crypto_core_scale3 PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761 + +#define crypto_core_inv PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761 + +#define crypto_core_inv3 PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761 + +#define crypto_core_mult3 PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761 + +#define crypto_core_mult PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761 + +#endif diff --git a/crypto_kem/sntrup857/META.yml b/crypto_kem/sntrup857/META.yml new file mode 100644 index 00000000..b6ddd07e --- /dev/null +++ b/crypto_kem/sntrup857/META.yml @@ -0,0 +1,26 @@ +name: sntrup857 +type: kem +claimed-nist-level: 4 +claimed-security: IND-CCA2 +length-public-key: 1322 +length-secret-key: 1999 +length-ciphertext: 1184 +length-shared-secret: 32 +nistkat-sha256: eb63dfbd70483c57c558f00db8211a723255c0c86e395ab4ce88148a623b2d27 +principal-submitters: + - Daniel J. 
Bernstein + - Chitchanok Chuengsatiansup + - Tanja Lange + - Christine van Vredendaal +implementations: + - name: clean + version: supercop-20200826 + - name: avx2 + version: supercop-20200826 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 diff --git a/crypto_kem/sntrup857/avx2/LICENSE b/crypto_kem/sntrup857/avx2/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/sntrup857/avx2/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/sntrup857/avx2/Makefile b/crypto_kem/sntrup857/avx2/Makefile new file mode 100644 index 00000000..58d4d010 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/Makefile @@ -0,0 +1,22 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libsntrup857_avx2.a +HEADERS=api.h crypto_core_inv3sntrup857.h crypto_core_invsntrup857.h crypto_core_mult3sntrup857.h crypto_core_multsntrup857.h crypto_core_multsntrup857_ntt.h crypto_core_scale3sntrup857.h crypto_core_weightsntrup857.h crypto_core_wforcesntrup857.h crypto_decode_857x1723.h crypto_decode_857x3.h crypto_decode_857x5167.h crypto_decode_857xint16.h crypto_decode_857xint32.h crypto_decode_int16.h crypto_encode_857x1723.h crypto_encode_857x1723round.h crypto_encode_857x3.h crypto_encode_857x5167.h crypto_encode_857xfreeze3.h crypto_encode_857xint16.h crypto_encode_int16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_1184.h params.h +OBJECTS=crypto_core_inv3sntrup857.o crypto_core_invsntrup857.o crypto_core_mult3sntrup857.o crypto_core_multsntrup857.o crypto_core_multsntrup857_ntt.o crypto_core_scale3sntrup857.o crypto_core_weightsntrup857.o crypto_core_wforcesntrup857.o crypto_decode_857x1723.o crypto_decode_857x3.o crypto_decode_857x5167.o crypto_decode_857xint16.o crypto_decode_857xint32.o crypto_decode_int16.o crypto_encode_857x1723.o crypto_encode_857x1723round.o crypto_encode_857x3.o crypto_encode_857x5167.o crypto_encode_857xfreeze3.o crypto_encode_857xint16.o crypto_encode_int16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1184.o kem.o + +CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.s $(HEADERS) + $(AS) -o $@ $< + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/sntrup857/avx2/api.h b/crypto_kem/sntrup857/avx2/api.h new file mode 100644 index 00000000..e21ba752 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/api.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_API_H +#define PQCLEAN_SNTRUP857_AVX2_API_H + + + +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_ALGNAME "sntrup857" + +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_SECRETKEYBYTES 1999 +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_PUBLICKEYBYTES 1322 +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_CIPHERTEXTBYTES 1184 +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_BYTES 32 + +int PQCLEAN_SNTRUP857_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int PQCLEAN_SNTRUP857_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); +int PQCLEAN_SNTRUP857_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_core_inv3sntrup857.c b/crypto_kem/sntrup857/avx2/crypto_core_inv3sntrup857.c new file mode 100644 index 00000000..70a67231 --- /dev/null +++ 
b/crypto_kem/sntrup857/avx2/crypto_core_inv3sntrup857.c @@ -0,0 +1,658 @@ +#include "crypto_core_inv3sntrup857.h" +#include + + +#define int8 int8_t +typedef int8 small; + +#define p 857 +#define ppad 1024 +#define numvec 4 + +typedef __m256i vec256; + +/* +This code stores 1024-coeff poly as vec256[4]. +Order of 256 coefficients in each vec256 +is optimized in light of costs of vector instructions: + 0,4,...,252 in 64-bit word; + 1,5,...,253 in 64-bit word; + 2,6,...,254 in 64-bit word; + 3,7,...,255 in 64-bit word. +*/ + +static inline void vec256_frombits(vec256 *v, const small *b) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 b0 = _mm256_loadu_si256((vec256 *) b); + b += 32; /* 0,1,...,31 */ + vec256 b1 = _mm256_loadu_si256((vec256 *) b); + b += 32; /* 32,33,... */ + vec256 b2 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b3 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b4 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b5 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b6 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b7 = _mm256_loadu_si256((vec256 *) b); + b += 32; + + vec256 c0 = _mm256_unpacklo_epi32(b0, b1); /* 0 1 2 3 32 33 34 35 4 5 6 7 36 37 38 39 ... 55 */ + vec256 c1 = _mm256_unpackhi_epi32(b0, b1); /* 8 9 10 11 40 41 42 43 ... 63 */ + vec256 c2 = _mm256_unpacklo_epi32(b2, b3); + vec256 c3 = _mm256_unpackhi_epi32(b2, b3); + vec256 c4 = _mm256_unpacklo_epi32(b4, b5); + vec256 c5 = _mm256_unpackhi_epi32(b4, b5); + vec256 c6 = _mm256_unpacklo_epi32(b6, b7); + vec256 c7 = _mm256_unpackhi_epi32(b6, b7); + + vec256 d0 = c0 | _mm256_slli_epi32(c1, 2); /* 0 8, 1 9, 2 10, 3 11, 32 40, 33 41, ..., 55 63 */ + vec256 d2 = c2 | _mm256_slli_epi32(c3, 2); + vec256 d4 = c4 | _mm256_slli_epi32(c5, 2); + vec256 d6 = c6 | _mm256_slli_epi32(c7, 2); + + vec256 e0 = _mm256_unpacklo_epi64(d0, d2); + vec256 e2 = _mm256_unpackhi_epi64(d0, d2); + vec256 e4 = _mm256_unpacklo_epi64(d4, d6); + vec256 e6 = _mm256_unpackhi_epi64(d4, d6); + + vec256 f0 = e0 | _mm256_slli_epi32(e2, 1); + vec256 f4 = e4 | _mm256_slli_epi32(e6, 1); + + vec256 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + vec256 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + + vec256 h = g0 | _mm256_slli_epi32(g4, 4); + +#define TRANSPOSE _mm256_set_epi8( 31,27,23,19, 30,26,22,18, 29,25,21,17, 28,24,20,16, 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0 ) + h = _mm256_shuffle_epi8(h, TRANSPOSE); + h = _mm256_permute4x64_epi64(h, 0xd8); + h = _mm256_shuffle_epi32(h, 0xd8); + + *v++ = h; + } +} + +static inline void vec256_tobits(const vec256 *v, small *b) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 h = *v++; + + h = _mm256_shuffle_epi32(h, 0xd8); + h = _mm256_permute4x64_epi64(h, 0xd8); + h = _mm256_shuffle_epi8(h, TRANSPOSE); + + vec256 g0 = h & _mm256_set1_epi8(15); + vec256 g4 = _mm256_srli_epi32(h, 4) & _mm256_set1_epi8(15); + + vec256 f0 = _mm256_permute2x128_si256(g0, g4, 0x20); + vec256 f4 = _mm256_permute2x128_si256(g0, g4, 0x31); + + vec256 e0 = f0 & _mm256_set1_epi8(5); + vec256 e2 = _mm256_srli_epi32(f0, 1) & _mm256_set1_epi8(5); + vec256 e4 = f4 & _mm256_set1_epi8(5); + vec256 e6 = _mm256_srli_epi32(f4, 1) & _mm256_set1_epi8(5); + + vec256 d0 = _mm256_unpacklo_epi32(e0, e2); + vec256 d2 = _mm256_unpackhi_epi32(e0, e2); + vec256 d4 = _mm256_unpacklo_epi32(e4, e6); + vec256 d6 = _mm256_unpackhi_epi32(e4, e6); + + vec256 c0 = d0 & _mm256_set1_epi8(1); + vec256 c1 = _mm256_srli_epi32(d0, 2) & _mm256_set1_epi8(1); + vec256 c2 = d2 & _mm256_set1_epi8(1); + vec256 c3 = 
_mm256_srli_epi32(d2, 2) & _mm256_set1_epi8(1); + vec256 c4 = d4 & _mm256_set1_epi8(1); + vec256 c5 = _mm256_srli_epi32(d4, 2) & _mm256_set1_epi8(1); + vec256 c6 = d6 & _mm256_set1_epi8(1); + vec256 c7 = _mm256_srli_epi32(d6, 2) & _mm256_set1_epi8(1); + + vec256 b0 = _mm256_unpacklo_epi64(c0, c1); + vec256 b1 = _mm256_unpackhi_epi64(c0, c1); + vec256 b2 = _mm256_unpacklo_epi64(c2, c3); + vec256 b3 = _mm256_unpackhi_epi64(c2, c3); + vec256 b4 = _mm256_unpacklo_epi64(c4, c5); + vec256 b5 = _mm256_unpackhi_epi64(c4, c5); + vec256 b6 = _mm256_unpacklo_epi64(c6, c7); + vec256 b7 = _mm256_unpackhi_epi64(c6, c7); + + _mm256_storeu_si256((vec256 *) b, b0); + b += 32; + _mm256_storeu_si256((vec256 *) b, b1); + b += 32; + _mm256_storeu_si256((vec256 *) b, b2); + b += 32; + _mm256_storeu_si256((vec256 *) b, b3); + b += 32; + _mm256_storeu_si256((vec256 *) b, b4); + b += 32; + _mm256_storeu_si256((vec256 *) b, b5); + b += 32; + _mm256_storeu_si256((vec256 *) b, b6); + b += 32; + _mm256_storeu_si256((vec256 *) b, b7); + b += 32; + } +} + +static void vec256_init(vec256 *G0, vec256 *G1, const small *s) { + int i; + small srev[ppad + (ppad - p)]; + small si; + small g0[ppad]; + small g1[ppad]; + + for (i = 0; i < p; ++i) { + srev[ppad - 1 - i] = s[i]; + } + for (i = 0; i < ppad - p; ++i) { + srev[i] = 0; + } + for (i = p; i < ppad; ++i) { + srev[i + ppad - p] = 0; + } + + for (i = 0; i < ppad; ++i) { + si = srev[i + ppad - p]; + g0[i] = si & 1; + g1[i] = (si >> 1) & g0[i]; + } + + vec256_frombits(G0, g0); + vec256_frombits(G1, g1); +} + +static void vec256_final(small *out, const vec256 *V0, const vec256 *V1) { + int i; + small v0[ppad]; + small v1[ppad]; + small v[ppad]; + small vrev[ppad + (ppad - p)]; + + vec256_tobits(V0, v0); + vec256_tobits(V1, v1); + + for (i = 0; i < ppad; ++i) { + v[i] = v0[i] + 2 * v1[i] - 4 * (v0[i] & v1[i]); + } + + for (i = 0; i < ppad; ++i) { + vrev[i] = v[ppad - 1 - i]; + } + for (i = ppad; i < ppad + (ppad - p); ++i) { + vrev[i] = 0; + } + + for (i = 0; i < p; ++i) { + out[i] = vrev[i + ppad - p]; + } +} + +static inline int negative_mask(int x) { + return x >> 31; +} + +static inline void vec256_swap(vec256 *f, vec256 *g, int len, vec256 mask) { + vec256 flip; + int i; + + for (i = 0; i < len; ++i) { + flip = mask & (f[i] ^ g[i]); + f[i] ^= flip; + g[i] ^= flip; + } +} + +static inline void vec256_scale(vec256 *f0, vec256 *f1, const vec256 c0, const vec256 c1) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 f0i = f0[i]; + vec256 f1i = f1[i]; + + f0i &= c0; + f1i ^= c1; + f1i &= f0i; + + f0[i] = f0i; + f1[i] = f1i; + } +} + +static inline void vec256_eliminate(vec256 *f0, vec256 *f1, vec256 *g0, vec256 *g1, int len, const vec256 c0, const vec256 c1) { + int i; + + for (i = 0; i < len; ++i) { + vec256 f0i = f0[i]; + vec256 f1i = f1[i]; + vec256 g0i = g0[i]; + vec256 g1i = g1[i]; + vec256 t; + + f0i &= c0; + f1i ^= c1; + f1i &= f0i; + + t = g0i ^ f0i; + g0[i] = t | (g1i ^ f1i); + g1[i] = (g1i ^ f0i) & (f1i ^ t); + } +} + +static inline int vec256_bit0mask(vec256 *f) { + return -(_mm_cvtsi128_si32(_mm256_castsi256_si128(f[0])) & 1); +} + +static inline void vec256_divx_1(vec256 *f) { + vec256 f0 = f[0]; + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + + low0 = low0 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); +} + +static inline void vec256_divx_2(vec256 *f) { + vec256 f0 = f[0]; + vec256 f1 = f[1]; + + unsigned long long low0 = 
_mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + + low0 = (low0 >> 1) | (low1 << 63); + low1 = low1 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); + f[1] = _mm256_permute4x64_epi64(f1, 0x39); +} + +static inline void vec256_divx_3(vec256 *f) { + vec256 f0 = f[0]; + vec256 f1 = f[1]; + vec256 f2 = f[2]; + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2)); + + low0 = (low0 >> 1) | (low1 << 63); + low1 = (low1 >> 1) | (low2 << 63); + low2 = low2 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); + f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); + f[1] = _mm256_permute4x64_epi64(f1, 0x39); + f[2] = _mm256_permute4x64_epi64(f2, 0x39); +} + +static inline void vec256_divx_4(vec256 *f) { + vec256 f0 = f[0]; + vec256 f1 = f[1]; + vec256 f2 = f[2]; + vec256 f3 = f[3]; + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2)); + unsigned long long low3 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f3)); + + low0 = (low0 >> 1) | (low1 << 63); + low1 = (low1 >> 1) | (low2 << 63); + low2 = (low2 >> 1) | (low3 << 63); + low3 = low3 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); + f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3); + f3 = _mm256_blend_epi32(f3, _mm256_set_epi64x(0, 0, 0, low3), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); + f[1] = _mm256_permute4x64_epi64(f1, 0x39); + f[2] = _mm256_permute4x64_epi64(f2, 0x39); + f[3] = _mm256_permute4x64_epi64(f3, 0x39); +} + +static inline void vec256_timesx_1(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + + low0 = low0 << 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + + f[0] = f0; +} + +static inline void vec256_timesx_2(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + + low1 = (low1 << 1) | (low0 >> 63); + low0 = low0 << 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); + + f[0] = f0; + f[1] = f1; +} + +static inline void vec256_timesx_3(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); + vec256 f2 = _mm256_permute4x64_epi64(f[2], 0x93); + + unsigned long long low0 = *(unsigned long long *) &f0; + unsigned long long low1 = *(unsigned long long *) &f1; + unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2)); + + low2 = (low2 << 1) | (low1 >> 63); + low1 = (low1 << 1) | (low0 >> 63); + low0 = low0 << 
1; + + *(unsigned long long *) &f0 = low0; + *(unsigned long long *) &f1 = low1; + f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3); + + f[0] = f0; + f[1] = f1; + f[2] = f2; +} + +static inline void vec256_timesx_4(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); + vec256 f2 = _mm256_permute4x64_epi64(f[2], 0x93); + vec256 f3 = _mm256_permute4x64_epi64(f[3], 0x93); + + unsigned long long low0 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + unsigned long long low2 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f2)); + unsigned long long low3 = _mm_cvtsi128_si64(_mm256_castsi256_si128(f3)); + + low3 = (low3 << 1) | (low2 >> 63); + low2 = (low2 << 1) | (low1 >> 63); + low1 = (low1 << 1) | (low0 >> 63); + low0 = low0 << 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, low1), 0x3); + f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, low2), 0x3); + f3 = _mm256_blend_epi32(f3, _mm256_set_epi64x(0, 0, 0, low3), 0x3); + + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; +} + + +int PQCLEAN_SNTRUP857_AVX2_crypto_core_inv3sntrup857(unsigned char *outbytes, const unsigned char *inbytes) { + small *out = (void *) outbytes; + small *in = (void *) inbytes; + vec256 F0[numvec]; + vec256 F1[numvec]; + vec256 G0[numvec]; + vec256 G1[numvec]; + vec256 V0[numvec]; + vec256 V1[numvec]; + vec256 R0[numvec]; + vec256 R1[numvec]; + vec256 c0vec, c1vec; + int loop; + int c0, c1; + int minusdelta = -1; + int swapmask; + vec256 swapvec; + + vec256_init(G0, G1, in); + F0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1); + F0[1] = _mm256_set1_epi32(0); + F0[2] = _mm256_set1_epi32(0); + F0[3] = _mm256_set_epi32(0, 0, 0, 0, 0, 4194304, 0, 4194304); + F1[0] = _mm256_set1_epi32(0); + F1[1] = _mm256_set1_epi32(0); + F1[2] = _mm256_set1_epi32(0); + F1[3] = _mm256_set_epi32(0, 0, 0, 0, 0, 4194304, 0, 4194304); + + V0[0] = _mm256_set1_epi32(0); + V1[0] = _mm256_set1_epi32(0); + V0[1] = _mm256_set1_epi32(0); + V1[1] = _mm256_set1_epi32(0); + V0[2] = _mm256_set1_epi32(0); + V1[2] = _mm256_set1_epi32(0); + V0[3] = _mm256_set1_epi32(0); + V1[3] = _mm256_set1_epi32(0); + + R0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1); + R1[0] = _mm256_set1_epi32(0); + R0[1] = _mm256_set1_epi32(0); + R1[1] = _mm256_set1_epi32(0); + R0[2] = _mm256_set1_epi32(0); + R1[2] = _mm256_set1_epi32(0); + R0[3] = _mm256_set1_epi32(0); + R1[3] = _mm256_set1_epi32(0); + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_1(V0); + vec256_timesx_1(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 4, swapvec); + vec256_swap(F1, G1, 4, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 4, c0vec, c1vec); + vec256_divx_4(G0); + vec256_divx_4(G1); + + vec256_swap(V0, R0, 1, swapvec); + vec256_swap(V1, R1, 1, swapvec); + vec256_eliminate(V0, V1, R0, R1, 1, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_2(V0); + vec256_timesx_2(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = 
vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 4, swapvec); + vec256_swap(F1, G1, 4, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 4, c0vec, c1vec); + vec256_divx_4(G0); + vec256_divx_4(G1); + + vec256_swap(V0, R0, 2, swapvec); + vec256_swap(V1, R1, 2, swapvec); + vec256_eliminate(V0, V1, R0, R1, 2, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_3(V0); + vec256_timesx_3(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 4, swapvec); + vec256_swap(F1, G1, 4, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 4, c0vec, c1vec); + vec256_divx_4(G0); + vec256_divx_4(G1); + + vec256_swap(V0, R0, 3, swapvec); + vec256_swap(V1, R1, 3, swapvec); + vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); + } + + for (loop = 177; loop > 0; --loop) { + vec256_timesx_4(V0); + vec256_timesx_4(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 4, swapvec); + vec256_swap(F1, G1, 4, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 4, c0vec, c1vec); + vec256_divx_4(G0); + vec256_divx_4(G1); + + vec256_swap(V0, R0, 4, swapvec); + vec256_swap(V1, R1, 4, swapvec); + vec256_eliminate(V0, V1, R0, R1, 4, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_4(V0); + vec256_timesx_4(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 3, swapvec); + vec256_swap(F1, G1, 3, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); + vec256_divx_3(G0); + vec256_divx_3(G1); + + vec256_swap(V0, R0, 4, swapvec); + vec256_swap(V1, R1, 4, swapvec); + vec256_eliminate(V0, V1, R0, R1, 4, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_4(V0); + vec256_timesx_4(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 2, swapvec); + vec256_swap(F1, G1, 2, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 2, c0vec, c1vec); + vec256_divx_2(G0); + vec256_divx_2(G1); + + vec256_swap(V0, R0, 4, swapvec); + vec256_swap(V1, R1, 4, swapvec); + vec256_eliminate(V0, V1, R0, R1, 4, c0vec, c1vec); + } + + for (loop = 256; loop > 0; 
--loop) { + vec256_timesx_4(V0); + vec256_timesx_4(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 1, swapvec); + vec256_swap(F1, G1, 1, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 1, c0vec, c1vec); + vec256_divx_1(G0); + vec256_divx_1(G1); + + vec256_swap(V0, R0, 4, swapvec); + vec256_swap(V1, R1, 4, swapvec); + vec256_eliminate(V0, V1, R0, R1, 4, c0vec, c1vec); + } + + c0vec = _mm256_set1_epi32(vec256_bit0mask(F0)); + c1vec = _mm256_set1_epi32(vec256_bit0mask(F1)); + vec256_scale(V0, V1, c0vec, c1vec); + + vec256_final(out, V0, V1); + out[p] = negative_mask(minusdelta); + return 0; +} diff --git a/crypto_kem/sntrup857/avx2/crypto_core_inv3sntrup857.h b/crypto_kem/sntrup857/avx2/crypto_core_inv3sntrup857.h new file mode 100644 index 00000000..d86a59ee --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_inv3sntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_INV3SNTRUP857_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_INV3SNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_inv3sntrup857_OUTPUTBYTES 858 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_inv3sntrup857_INPUTBYTES 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_inv3sntrup857_KEYBYTES 0 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_inv3sntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_AVX2_crypto_core_inv3sntrup857(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_core_invsntrup857.c b/crypto_kem/sntrup857/avx2/crypto_core_invsntrup857.c new file mode 100644 index 00000000..764c10ed --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_invsntrup857.c @@ -0,0 +1,202 @@ +#include "crypto_core_invsntrup857.h" +#include "params.h" +#include + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + + + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + return x >> 15; /* XXX: theoretically need gcc -fwrapv for this */ +} + +/* ----- arithmetic mod q */ + +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +/* nonnegative e */ +static Fq Fq_pow(Fq a, int e) { + if (e == 0) { + return 1; + } + if (e == 1) { + return a; + } + if (e & 1) { + return Fq_freeze(a * (int32)Fq_pow(a, e - 1)); + } + a = Fq_freeze(a * (int32)a); + return Fq_pow(a, e >> 1); +} + +static Fq Fq_recip(Fq a) { + return Fq_pow(a, q - 2); +} + +/* ----- more */ + +#define qvec _mm256_set1_epi16(q) +#define qinvvec _mm256_set1_epi16(qinv) + +static inline __m256i montproduct(__m256i x, __m256i y, __m256i yqinv) { + __m256i hi, d, e; + + d = _mm256_mullo_epi16(x, yqinv); + hi = _mm256_mulhi_epi16(x, y); + e = _mm256_mulhi_epi16(d, 
qvec); + return _mm256_sub_epi16(hi, e); +} + +static inline void vectormodq_swapeliminate(Fq *f, Fq *g, int len, const Fq f0, const Fq g0, int mask) { + __m256i f0vec = _mm256_set1_epi16(f0); + __m256i g0vec = _mm256_set1_epi16(g0); + __m256i f0vecqinv = _mm256_mullo_epi16(f0vec, qinvvec); + __m256i g0vecqinv = _mm256_mullo_epi16(g0vec, qinvvec); + __m256i maskvec = _mm256_set1_epi32(mask); + + while (len > 0) { + __m256i fi = _mm256_loadu_si256((__m256i *) f); + __m256i gi = _mm256_loadu_si256((__m256i *) g); + __m256i finew = _mm256_blendv_epi8(fi, gi, maskvec); + __m256i ginew = _mm256_blendv_epi8(gi, fi, maskvec); + ginew = _mm256_sub_epi16(montproduct(ginew, f0vec, f0vecqinv), montproduct(finew, g0vec, g0vecqinv)); + _mm256_storeu_si256((__m256i *) f, finew); + _mm256_storeu_si256((__m256i *) (g - 1), ginew); + f += 16; + g += 16; + len -= 16; + } +} + +static inline void vectormodq_xswapeliminate(Fq *f, Fq *g, int len, const Fq f0, const Fq g0, int mask) { + __m256i f0vec = _mm256_set1_epi16(f0); + __m256i g0vec = _mm256_set1_epi16(g0); + __m256i f0vecqinv = _mm256_mullo_epi16(f0vec, qinvvec); + __m256i g0vecqinv = _mm256_mullo_epi16(g0vec, qinvvec); + __m256i maskvec = _mm256_set1_epi32(mask); + + f += len + (-len & 15); + g += len + (-len & 15); + while (len > 0) { + f -= 16; + g -= 16; + len -= 16; + __m256i fi = _mm256_loadu_si256((__m256i *) f); + __m256i gi = _mm256_loadu_si256((__m256i *) g); + __m256i finew = _mm256_blendv_epi8(fi, gi, maskvec); + __m256i ginew = _mm256_blendv_epi8(gi, fi, maskvec); + ginew = _mm256_sub_epi16(montproduct(ginew, f0vec, f0vecqinv), montproduct(finew, g0vec, g0vecqinv)); + _mm256_storeu_si256((__m256i *) (f + 1), finew); + _mm256_storeu_si256((__m256i *) g, ginew); + } +} + +int PQCLEAN_SNTRUP857_AVX2_crypto_core_invsntrup857(unsigned char *outbytes, const unsigned char *inbytes) { + small *in = (void *) inbytes; + int loop; + Fq out[p], f[ppad], g[ppad], v[ppad], r[ppad]; + Fq f0, g0; + Fq scale; + int i; + int delta = 1; + int minusdelta; + int fgflip; + int swap; + + for (i = 0; i < ppad; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = -1; + f[p] = -1; + /* generalization: initialize f to reversal of any deg-p polynomial m */ + + for (i = 0; i < p; ++i) { + g[i] = in[p - 1 - i]; + } + for (i = p; i < ppad; ++i) { + g[i] = 0; + } + + for (i = 0; i < ppad; ++i) { + r[i] = 0; + } + r[0] = Fq_recip(3); + + for (i = 0; i < ppad; ++i) { + v[i] = 0; + } + + for (loop = 0; loop < p; ++loop) { + g0 = Fq_freeze(g[0]); + f0 = f[0]; + + minusdelta = -delta; + swap = int16_negative_mask(minusdelta) & int16_nonzero_mask(g0); + delta ^= swap & (delta ^ minusdelta); + delta += 1; + + fgflip = swap & (f0 ^ g0); + f0 ^= fgflip; + g0 ^= fgflip; + + f[0] = f0; + + vectormodq_swapeliminate(f + 1, g + 1, p, f0, g0, swap); + vectormodq_xswapeliminate(v, r, loop + 1, f0, g0, swap); + } + + for (loop = p - 1; loop > 0; --loop) { + g0 = Fq_freeze(g[0]); + f0 = f[0]; + + minusdelta = -delta; + swap = int16_negative_mask(minusdelta) & int16_nonzero_mask(g0); + delta ^= swap & (delta ^ minusdelta); + delta += 1; + + fgflip = swap & (f0 ^ g0); + f0 ^= fgflip; + g0 ^= fgflip; + + f[0] = f0; + + vectormodq_swapeliminate(f + 1, g + 1, loop, f0, g0, swap); + vectormodq_xswapeliminate(v, r, p, f0, g0, swap); + } + + scale = Fq_recip(Fq_freeze(f[0])); + for (i = 0; i < p; ++i) { + out[i] = Fq_freeze(scale * (int32)Fq_freeze(v[p - i])); + } + + crypto_encode_pxint16(outbytes, out); + outbytes[2 * p] = int16_nonzero_mask(delta); + return 0; +} diff --git 
a/crypto_kem/sntrup857/avx2/crypto_core_invsntrup857.h b/crypto_kem/sntrup857/avx2/crypto_core_invsntrup857.h new file mode 100644 index 00000000..789e1a8b --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_invsntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_INVSNTRUP857_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_INVSNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_invsntrup857_OUTPUTBYTES 1715 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_invsntrup857_INPUTBYTES 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_invsntrup857_KEYBYTES 0 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_invsntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_AVX2_crypto_core_invsntrup857(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_core_mult3sntrup857.c b/crypto_kem/sntrup857/avx2/crypto_core_mult3sntrup857.c new file mode 100644 index 00000000..dd4b9fa4 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_mult3sntrup857.c @@ -0,0 +1,296 @@ +#include "crypto_core_mult3sntrup857.h" +#include "crypto_core_multsntrup857_ntt.h" +#include "crypto_decode_857xint16.h" +#include "crypto_encode_857xint16.h" +#include + +typedef int8_t int8; +typedef int16_t int16; + +#define int16x16 __m256i +#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p)) +#define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) +#define const_x16 _mm256_set1_epi16 +#define add_x16 _mm256_add_epi16 +#define sub_x16 _mm256_sub_epi16 +#define mullo_x16 _mm256_mullo_epi16 +#define mulhi_x16 _mm256_mulhi_epi16 +#define mulhrs_x16 _mm256_mulhrs_epi16 +#define signmask_x16(x) _mm256_srai_epi16((x),15) + +typedef union { + int16 v[512]; + int16x16 _dummy; +} vec512; + +typedef union { + int16 v[4][512]; + int16x16 _dummy; +} vec4x512; + +typedef union { + int16 v[1024]; + int16x16 _dummy; +} vec1024; + +typedef union { + int16 v[4 * 512]; + int16x16 _dummy; +} vec2048; + +static int16x16 squeeze_3_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(10923)), const_x16(3))); +} + +static int16x16 squeeze_7681_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); +} + +static int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(7681)); + return sub_x16(b, e); +} + +static void stride(int16 fpad[4][512], const int16 f[1024]) { + int16x16 f0, f1, f2, f3, g0, g1, g2, g3; + int i, j; + + for (j = 0; j < 256; j += 16) { + f0 = load_x16(&f[0]); + f1 = load_x16(&f[16]); + f2 = load_x16(&f[32]); + f3 = load_x16(&f[48]); + f += 64; + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g1 = _mm256_permute2x128_si256(f0, f2, 0x31); + g2 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g1); + f1 = _mm256_unpackhi_epi16(g0, g1); + f2 = _mm256_unpacklo_epi16(g2, g3); + f3 = _mm256_unpackhi_epi16(g2, g3); + g0 = _mm256_unpacklo_epi16(f0, f1); + g1 = _mm256_unpackhi_epi16(f0, f1); + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + + store_x16(&fpad[0][j], f0); + store_x16(&fpad[1][j], f1); + store_x16(&fpad[2][j], f2); + store_x16(&fpad[3][j], f3); + } + + for (i = 0; i < 
4; ++i) { + for (j = 256; j < 512; ++j) { + fpad[i][j] = 0; + } + } +} + +static void unstride(int16 f[2048], const int16 fpad[4][512]) { + int16x16 f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int j; + + for (j = 0; j < 512; j += 16) { + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + f3 = load_x16(&fpad[3][j]); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + + store_x16(&f[0], f0); + store_x16(&f[16], f1); + store_x16(&f[32], f2); + store_x16(&f[48], f3); + f += 64; + } +} + +static const vec512 y_7681 = { .v = { + -3593, -617, -2804, 3266, -2194, -1296, -1321, 810, 1414, 3706, -549, -396, -121, -2088, -2555, 1305, + -3777, 1921, 103, 3600, -2456, 1483, 1399, -1887, -1701, 2006, 1535, -3174, -2250, 2816, -2440, -1760, + -3625, 2830, 2043, -3689, 1100, 1525, -514, 7, 2876, -1599, 3153, -1881, -2495, -2237, -2535, 438, + 3182, 3364, -1431, 1738, 3696, -2557, -2956, 638, -2319, -1993, -2310, -3555, 834, -1986, 3772, -679, + 3593, 617, 2804, -3266, 2194, 1296, 1321, -810, -1414, -3706, 549, 396, 121, 2088, 2555, -1305, + 3777, -1921, -103, -3600, 2456, -1483, -1399, 1887, 1701, -2006, -1535, 3174, 2250, -2816, 2440, 1760, + 3625, -2830, -2043, 3689, -1100, -1525, 514, -7, -2876, 1599, -3153, 1881, 2495, 2237, 2535, -438, + -3182, -3364, 1431, -1738, -3696, 2557, 2956, -638, 2319, 1993, 2310, 3555, -834, 1986, -3772, 679, + 2665, 727, -2572, 2426, -2133, -1386, 1681, -1054, 2579, 3750, 373, 3417, 404, -2233, 3135, -3405, + -1799, 1521, 1497, -3831, -3480, -3428, 2883, -1698, -859, -2762, 2175, -194, -486, -3816, -1756, 2385, + -783, 1533, 3145, 2, 3310, -2743, 2224, -1166, 2649, -1390, 3692, 2789, 1919, 2835, -2391, -2732, + 1056, 1464, 1350, -915, -1168, -921, -3588, 3456, -2160, -1598, 730, 2919, 1532, -2764, -660, -2113, + -2665, -727, 2572, -2426, 2133, 1386, -1681, 1054, -2579, -3750, -373, -3417, -404, 2233, -3135, 3405, + 1799, -1521, -1497, 3831, 3480, 3428, -2883, 1698, 859, 2762, -2175, 194, 486, 3816, 1756, -2385, + 783, -1533, -3145, -2, -3310, 2743, -2224, 1166, -2649, 1390, -3692, -2789, -1919, -2835, 2391, 2732, + -1056, -1464, -1350, 915, 1168, 921, 3588, -3456, 2160, 1598, -730, -2919, -1532, 2764, 660, 2113, + 2005, -188, 2345, -3723, -1403, 2070, 83, -3214, -3752, -1012, 1837, -3208, 3287, 3335, -293, 796, + 592, 1519, -1338, 1931, 509, -2262, -3408, 3334, 3677, 2130, 642, 589, -2167, -1084, -370, -3163, + 3763, -893, -2303, -402, 2937, -1689, -1526, -3745, -2460, 2874, 2965, 124, -1669, -1441, -3312, 3781, + 2812, -2386, -2515, -429, -3343, 777, -826, -3366, -3657, -1404, -791, -2963, -692, 2532, 2083, 2258, + -2005, 188, -2345, 3723, 1403, -2070, -83, 3214, 3752, 1012, -1837, 3208, -3287, -3335, 293, -796, + -592, -1519, 1338, -1931, -509, 2262, 3408, -3334, -3677, -2130, -642, -589, 2167, 1084, 370, 3163, + -3763, 893, 2303, 402, -2937, 1689, 1526, 3745, 2460, -2874, -2965, -124, 1669, 1441, 3312, -3781, + -2812, 2386, 2515, 429, 3343, -777, 826, 3366, 3657, 1404, 791, 2963, 692, -2532, -2083, -2258, + 179, 1121, 2891, -3581, 3177, -658, -3314, -1509, -17, 151, 2815, 2786, 1278, 
-2767, -1072, -1151, + -1242, -2071, 2340, -1586, 2072, 1476, 2998, 2918, -3744, -3794, -1295, 451, -929, 2378, -1144, 434, + -1070, -436, -3550, -3568, 1649, 715, 3461, -1407, -2001, -1203, 3770, 1712, 2230, -3542, 2589, -3547, + -2059, -236, 3434, -3693, 2161, -670, 2719, 2339, -2422, 1181, 3450, 222, 1348, -226, 2247, -1779, + -179, -1121, -2891, 3581, -3177, 658, 3314, 1509, 17, -151, -2815, -2786, -1278, 2767, 1072, 1151, + 1242, 2071, -2340, 1586, -2072, -1476, -2998, -2918, 3744, 3794, 1295, -451, 929, -2378, 1144, -434, + 1070, 436, 3550, 3568, -1649, -715, -3461, 1407, 2001, 1203, -3770, -1712, -2230, 3542, -2589, 3547, + 2059, 236, -3434, 3693, -2161, 670, -2719, -2339, 2422, -1181, -3450, -222, -1348, 226, -2247, 1779, + } +} ; + +static void mult1024(int16 h[2048], const int16 f[1024], const int16 g[1024]) { + vec4x512 x1, x2; + vec2048 x3; +#define fpad (x1.v) +#define gpad (x2.v) +#define hpad fpad +#define h_7681 (x3.v) + int i; + + stride(fpad, f); + PQCLEAN_SNTRUP857_AVX2_ntt512_7681(fpad[0], 4); + + stride(gpad, g); + PQCLEAN_SNTRUP857_AVX2_ntt512_7681(gpad[0], 4); + + /* XXX: try arbitrary-degree Karatsuba */ + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); + int16x16 f3 = squeeze_7681_x16(load_x16(&fpad[3][i])); + int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); + int16x16 g3 = squeeze_7681_x16(load_x16(&gpad[3][i])); + int16x16 d0 = mulmod_7681_x16(f0, g0); + int16x16 d1 = mulmod_7681_x16(f1, g1); + int16x16 d2 = mulmod_7681_x16(f2, g2); + int16x16 d3 = mulmod_7681_x16(f3, g3); + int16x16 d0d1 = add_x16(d0, d1); + int16x16 d0d1d2 = add_x16(d0d1, d2); + int16x16 d0d1d2d3 = squeeze_7681_x16(add_x16(d0d1d2, d3)); + int16x16 d2d3 = add_x16(d2, d3); + int16x16 d1d2d3 = add_x16(d1, d2d3); + int16x16 e01 = mulmod_7681_x16(sub_x16(f0, f1), sub_x16(g0, g1)); + int16x16 e02 = mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g0, g2)); + int16x16 e03 = mulmod_7681_x16(sub_x16(f0, f3), sub_x16(g0, g3)); + int16x16 e12 = mulmod_7681_x16(sub_x16(f1, f2), sub_x16(g1, g2)); + int16x16 e13 = mulmod_7681_x16(sub_x16(f1, f3), sub_x16(g1, g3)); + int16x16 e23 = mulmod_7681_x16(sub_x16(f2, f3), sub_x16(g2, g3)); + int16x16 h0 = d0; + int16x16 h1 = sub_x16(d0d1, e01); + int16x16 h2 = sub_x16(d0d1d2, e02); + int16x16 h3 = sub_x16(d0d1d2d3, add_x16(e12, e03)); + int16x16 h4 = sub_x16(d1d2d3, e13); + int16x16 h5 = sub_x16(d2d3, e23); + int16x16 h6 = d3; + int16x16 twist = load_x16(&y_7681.v[i]); + h4 = mulmod_7681_x16(h4, twist); + h5 = mulmod_7681_x16(h5, twist); + h6 = mulmod_7681_x16(h6, twist); + h0 = add_x16(h0, h4); + h1 = add_x16(h1, h5); + h2 = add_x16(h2, h6); + store_x16(&hpad[0][i], squeeze_7681_x16(h0)); + store_x16(&hpad[1][i], squeeze_7681_x16(h1)); + store_x16(&hpad[2][i], squeeze_7681_x16(h2)); + store_x16(&hpad[3][i], squeeze_7681_x16(h3)); + } + + PQCLEAN_SNTRUP857_AVX2_invntt512_7681(hpad[0], 4); + unstride(h_7681, (const int16(*)[512]) hpad); + + for (i = 0; i < 2048; i += 16) { + int16x16 u = load_x16(&h_7681[i]); + u = mulmod_7681_x16(u, const_x16(956)); + store_x16(&h[i], u); + } +} + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xint16 + +#define p 857 + +static inline int16x16 freeze_3_x16(int16x16 x) { 
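+    /* reduce each lane to its canonical representative in {-1,0,1}: add 3 to negative lanes, then subtract 3 from lanes that are >= 2 (branch-free select) */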
+ int16x16 mask, x3; + x = add_x16(x, const_x16(3)&signmask_x16(x)); + mask = signmask_x16(sub_x16(x, const_x16(2))); + x3 = sub_x16(x, const_x16(3)); + x = _mm256_blendv_epi8(x3, x, mask); + return x; +} + +int PQCLEAN_SNTRUP857_AVX2_crypto_core_mult3sntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + vec1024 x1, x2; + vec2048 x3; +#define f (x1.v) +#define g (x2.v) +#define fg (x3.v) +#define h f + int i; + int16x16 x; + + x = const_x16(0); + for (i = p & ~15; i < 1024; i += 16) { + store_x16(&f[i], x); + } + for (i = p & ~15; i < 1024; i += 16) { + store_x16(&g[i], x); + } + + for (i = 0; i < p; ++i) { + int8 fi = inbytes[i]; + int8 fi0 = fi & 1; + f[i] = fi0 - (fi & (fi0 << 1)); + } + for (i = 0; i < p; ++i) { + int8 gi = kbytes[i]; + int8 gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + mult1024(fg, f, g); + + fg[0] -= fg[p - 1]; + for (i = 0; i < 1024; i += 16) { + int16x16 fgi = load_x16(&fg[i]); + int16x16 fgip = load_x16(&fg[i + p]); + int16x16 fgip1 = load_x16(&fg[i + p - 1]); + x = add_x16(fgi, add_x16(fgip, fgip1)); + x = freeze_3_x16(squeeze_3_x16(x)); + store_x16(&h[i], x); + } + + for (i = 0; i < p; ++i) { + outbytes[i] = h[i]; + } + + return 0; +} diff --git a/crypto_kem/sntrup857/avx2/crypto_core_mult3sntrup857.h b/crypto_kem/sntrup857/avx2/crypto_core_mult3sntrup857.h new file mode 100644 index 00000000..fc189828 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_mult3sntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_MULT3SNTRUP857_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_MULT3SNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_mult3sntrup857_OUTPUTBYTES 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_mult3sntrup857_INPUTBYTES 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_mult3sntrup857_KEYBYTES 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_mult3sntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_AVX2_crypto_core_mult3sntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857.c b/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857.c new file mode 100644 index 00000000..3cd25ff6 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857.c @@ -0,0 +1,421 @@ +#include "crypto_core_multsntrup857.h" +#include "crypto_core_multsntrup857_ntt.h" +#include "crypto_decode_857xint16.h" +#include "crypto_encode_857xint16.h" +#include + +typedef int8_t int8; +typedef int16_t int16; + +#define int16x16 __m256i +#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p)) +#define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) +#define const_x16 _mm256_set1_epi16 +#define add_x16 _mm256_add_epi16 +#define sub_x16 _mm256_sub_epi16 +#define mullo_x16 _mm256_mullo_epi16 +#define mulhi_x16 _mm256_mulhi_epi16 +#define mulhrs_x16 _mm256_mulhrs_epi16 +#define signmask_x16(x) _mm256_srai_epi16((x),15) + +typedef union { + int16 v[512]; + int16x16 _dummy; +} vec512; + +typedef union { + int16 v[4][512]; + int16x16 _dummy; +} vec4x512; + +typedef union { + int16 v[1024]; + int16x16 _dummy; +} vec1024; + +typedef union { + int16 v[4 * 512]; + int16x16 _dummy; +} vec2048; + +static inline int16x16 squeeze_5167_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(6)), const_x16(5167))); +} + +static inline int16x16 squeeze_7681_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); +} + +static inline int16x16 
squeeze_10753_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(3)), const_x16(10753))); +} + +static inline int16x16 mulmod_5167_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-19761)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(5167)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(7681)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_10753_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-10751)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(10753)); + return sub_x16(b, e); +} + +static void stride(int16 fpad[4][512], const int16 f[1024]) { + int16x16 f0, f1, f2, f3, g0, g1, g2, g3; + int i, j; + + for (j = 0; j < 256; j += 16) { + f0 = load_x16(&f[0]); + f1 = load_x16(&f[16]); + f2 = load_x16(&f[32]); + f3 = load_x16(&f[48]); + f += 64; + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g1 = _mm256_permute2x128_si256(f0, f2, 0x31); + g2 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g1); + f1 = _mm256_unpackhi_epi16(g0, g1); + f2 = _mm256_unpacklo_epi16(g2, g3); + f3 = _mm256_unpackhi_epi16(g2, g3); + g0 = _mm256_unpacklo_epi16(f0, f1); + g1 = _mm256_unpackhi_epi16(f0, f1); + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + + store_x16(&fpad[0][j], f0); + store_x16(&fpad[1][j], f1); + store_x16(&fpad[2][j], f2); + store_x16(&fpad[3][j], f3); + } + + for (i = 0; i < 4; ++i) { + for (j = 256; j < 512; ++j) { + fpad[i][j] = 0; + } + } +} + +static void unstride(int16 f[2048], const int16 fpad[4][512]) { + int16x16 f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int j; + + for (j = 0; j < 512; j += 16) { + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + f3 = load_x16(&fpad[3][j]); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + + store_x16(&f[0], f0); + store_x16(&f[16], f1); + store_x16(&f[32], f2); + store_x16(&f[48], f3); + f += 64; + } +} + +static const vec512 y_7681 = { .v = { + -3593, -617, -2804, 3266, -2194, -1296, -1321, 810, 1414, 3706, -549, -396, -121, -2088, -2555, 1305, + -3777, 1921, 103, 3600, -2456, 1483, 1399, -1887, -1701, 2006, 1535, -3174, -2250, 2816, -2440, -1760, + -3625, 2830, 2043, -3689, 1100, 1525, -514, 7, 2876, -1599, 3153, -1881, -2495, -2237, -2535, 438, + 3182, 3364, -1431, 1738, 3696, -2557, -2956, 638, -2319, -1993, -2310, -3555, 834, -1986, 3772, -679, + 3593, 617, 2804, -3266, 2194, 1296, 1321, -810, -1414, -3706, 549, 396, 121, 2088, 2555, 
-1305, + 3777, -1921, -103, -3600, 2456, -1483, -1399, 1887, 1701, -2006, -1535, 3174, 2250, -2816, 2440, 1760, + 3625, -2830, -2043, 3689, -1100, -1525, 514, -7, -2876, 1599, -3153, 1881, 2495, 2237, 2535, -438, + -3182, -3364, 1431, -1738, -3696, 2557, 2956, -638, 2319, 1993, 2310, 3555, -834, 1986, -3772, 679, + 2665, 727, -2572, 2426, -2133, -1386, 1681, -1054, 2579, 3750, 373, 3417, 404, -2233, 3135, -3405, + -1799, 1521, 1497, -3831, -3480, -3428, 2883, -1698, -859, -2762, 2175, -194, -486, -3816, -1756, 2385, + -783, 1533, 3145, 2, 3310, -2743, 2224, -1166, 2649, -1390, 3692, 2789, 1919, 2835, -2391, -2732, + 1056, 1464, 1350, -915, -1168, -921, -3588, 3456, -2160, -1598, 730, 2919, 1532, -2764, -660, -2113, + -2665, -727, 2572, -2426, 2133, 1386, -1681, 1054, -2579, -3750, -373, -3417, -404, 2233, -3135, 3405, + 1799, -1521, -1497, 3831, 3480, 3428, -2883, 1698, 859, 2762, -2175, 194, 486, 3816, 1756, -2385, + 783, -1533, -3145, -2, -3310, 2743, -2224, 1166, -2649, 1390, -3692, -2789, -1919, -2835, 2391, 2732, + -1056, -1464, -1350, 915, 1168, 921, 3588, -3456, 2160, 1598, -730, -2919, -1532, 2764, 660, 2113, + 2005, -188, 2345, -3723, -1403, 2070, 83, -3214, -3752, -1012, 1837, -3208, 3287, 3335, -293, 796, + 592, 1519, -1338, 1931, 509, -2262, -3408, 3334, 3677, 2130, 642, 589, -2167, -1084, -370, -3163, + 3763, -893, -2303, -402, 2937, -1689, -1526, -3745, -2460, 2874, 2965, 124, -1669, -1441, -3312, 3781, + 2812, -2386, -2515, -429, -3343, 777, -826, -3366, -3657, -1404, -791, -2963, -692, 2532, 2083, 2258, + -2005, 188, -2345, 3723, 1403, -2070, -83, 3214, 3752, 1012, -1837, 3208, -3287, -3335, 293, -796, + -592, -1519, 1338, -1931, -509, 2262, 3408, -3334, -3677, -2130, -642, -589, 2167, 1084, 370, 3163, + -3763, 893, 2303, 402, -2937, 1689, 1526, 3745, 2460, -2874, -2965, -124, 1669, 1441, 3312, -3781, + -2812, 2386, 2515, 429, 3343, -777, 826, 3366, 3657, 1404, 791, 2963, 692, -2532, -2083, -2258, + 179, 1121, 2891, -3581, 3177, -658, -3314, -1509, -17, 151, 2815, 2786, 1278, -2767, -1072, -1151, + -1242, -2071, 2340, -1586, 2072, 1476, 2998, 2918, -3744, -3794, -1295, 451, -929, 2378, -1144, 434, + -1070, -436, -3550, -3568, 1649, 715, 3461, -1407, -2001, -1203, 3770, 1712, 2230, -3542, 2589, -3547, + -2059, -236, 3434, -3693, 2161, -670, 2719, 2339, -2422, 1181, 3450, 222, 1348, -226, 2247, -1779, + -179, -1121, -2891, 3581, -3177, 658, 3314, 1509, 17, -151, -2815, -2786, -1278, 2767, 1072, 1151, + 1242, 2071, -2340, 1586, -2072, -1476, -2998, -2918, 3744, 3794, 1295, -451, 929, -2378, 1144, -434, + 1070, 436, 3550, 3568, -1649, -715, -3461, 1407, 2001, 1203, -3770, -1712, -2230, 3542, -2589, 3547, + 2059, 236, -3434, 3693, -2161, 670, -2719, -2339, 2422, -1181, -3450, -222, -1348, 226, -2247, 1779, + } +} ; +static const vec512 y_10753 = { .v = { + 1018, -1520, -2935, -4189, 2413, 918, 4, 1299, -2695, 1341, -205, -4744, -3784, 2629, 2565, -3062, + 223, -4875, 2790, -2576, -3686, -2503, 3550, -3085, 730, 1931, -4513, 4876, -3364, 5213, 2178, 2984, + 4188, -4035, 4129, -544, 357, 4347, 1284, -2388, -4855, 341, -1287, 4102, 425, 5175, -4616, -4379, + -3688, 5063, 3091, 1085, -376, 3012, -268, -1009, -2236, -3823, 2982, -4742, -4544, -4095, 193, 847, + -1018, 1520, 2935, 4189, -2413, -918, -4, -1299, 2695, -1341, 205, 4744, 3784, -2629, -2565, 3062, + -223, 4875, -2790, 2576, 3686, 2503, -3550, 3085, -730, -1931, 4513, -4876, 3364, -5213, -2178, -2984, + -4188, 4035, -4129, 544, -357, -4347, -1284, 2388, 4855, -341, 1287, -4102, -425, -5175, 4616, 4379, + 3688, -5063, -3091, 
-1085, 376, -3012, 268, 1009, 2236, 3823, -2982, 4742, 4544, 4095, -193, -847, + -4734, 4977, -400, -864, 567, -5114, -4286, 635, 512, -1356, -779, -2973, 675, -5064, -1006, 1268, + 2998, 2981, -151, -3337, 3198, -909, 2737, -970, 2774, 886, 2206, 1324, 2271, 454, -326, -3715, + -3441, -4580, 636, 2234, -794, 3615, 578, -472, 3057, -5156, -2740, 2684, 1615, -1841, -336, -1586, + 5341, -116, 5294, 4123, 5023, -1458, -3169, 467, -2045, 4828, -1572, -5116, -2213, -4808, 2884, 1068, + 4734, -4977, 400, 864, -567, 5114, 4286, -635, -512, 1356, 779, 2973, -675, 5064, 1006, -1268, + -2998, -2981, 151, 3337, -3198, 909, -2737, 970, -2774, -886, -2206, -1324, -2271, -454, 326, 3715, + 3441, 4580, -636, -2234, 794, -3615, -578, 472, -3057, 5156, 2740, -2684, -1615, 1841, 336, 1586, + -5341, 116, -5294, -4123, -5023, 1458, 3169, -467, 2045, -4828, 1572, 5116, 2213, 4808, -2884, -1068, + 3453, 2196, 2118, 5005, 2428, -2062, -1930, 2283, 4601, 3524, -3241, -1409, -2230, -5015, 4359, 4254, + 5309, 2657, -2050, -4428, 4250, -2015, -3148, -778, 2624, -1573, 40, 2237, -573, -4447, 2909, 1122, + 854, -4782, 2439, 4408, 5172, 4784, 4144, 1639, 3760, 2139, 2680, -663, 4621, 3135, 1349, -97, + 5215, 3410, -2117, -1992, -1381, -1635, 274, -2419, 3570, 458, 2087, -2374, -1132, 2662, -1722, 5313, + -3453, -2196, -2118, -5005, -2428, 2062, 1930, -2283, -4601, -3524, 3241, 1409, 2230, 5015, -4359, -4254, + -5309, -2657, 2050, 4428, -4250, 2015, 3148, 778, -2624, 1573, -40, -2237, 573, 4447, -2909, -1122, + -854, 4782, -2439, -4408, -5172, -4784, -4144, -1639, -3760, -2139, -2680, 663, -4621, -3135, -1349, 97, + -5215, -3410, 2117, 1992, 1381, 1635, -274, 2419, -3570, -458, -2087, 2374, 1132, -2662, 1722, -5313, + -2487, -554, 4519, 2449, 73, 3419, 624, -1663, -1053, 4889, 279, 1893, 1111, 1510, 2279, -4540, + 2529, 2963, 5120, -3995, -5107, -3360, -5356, 2625, -4403, 152, -5083, -2807, 2113, -4000, -4328, 3125, + -2605, 4967, -1056, 1160, 1927, 693, -4003, 3827, -4670, -569, 3535, -5268, 1782, 825, 355, 5068, + 5334, 4859, -1689, -2788, -4891, -3260, 1204, 3891, -4720, -4973, 2813, 2205, 834, -4393, -2151, 3096, + 2487, 554, -4519, -2449, -73, -3419, -624, 1663, 1053, -4889, -279, -1893, -1111, -1510, -2279, 4540, + -2529, -2963, -5120, 3995, 5107, 3360, 5356, -2625, 4403, -152, 5083, 2807, -2113, 4000, 4328, -3125, + 2605, -4967, 1056, -1160, -1927, -693, 4003, -3827, 4670, 569, -3535, 5268, -1782, -825, -355, -5068, + -5334, -4859, 1689, 2788, 4891, 3260, -1204, -3891, 4720, 4973, -2813, -2205, -834, 4393, 2151, -3096, + } +} ; +/* + can also compute these on the fly, and share storage, + at expense of 2 NTTs on top of the 24 NTTs below: + ... + for (i = 0;i < 512;++i) y_7681[i] = 0; + y_7681[1] = -3593; + PQCLEAN_SNTRUP857_AVX2_ntt512_7681(y_7681,1); + ... 
+ for (i = 0;i < 512;++i) y_10753[i] = 0; + y_10753[1] = 1018; + PQCLEAN_SNTRUP857_AVX2_ntt512_10753(y_10753,1); +*/ + +static void mult1024(int16 h[2048], const int16 f[1024], const int16 g[1024]) { + vec4x512 x1, x2; + vec2048 x3, x4; +#define fpad (x1.v) +#define gpad (x2.v) +#define hpad fpad +#define h_7681 (x3.v) +#define h_10753 (x4.v) + int i; + + stride(fpad, f); + PQCLEAN_SNTRUP857_AVX2_ntt512_7681(fpad[0], 4); + + stride(gpad, g); + PQCLEAN_SNTRUP857_AVX2_ntt512_7681(gpad[0], 4); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); + int16x16 f3 = squeeze_7681_x16(load_x16(&fpad[3][i])); + int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); + int16x16 g3 = squeeze_7681_x16(load_x16(&gpad[3][i])); + int16x16 d0 = mulmod_7681_x16(f0, g0); + int16x16 d1 = mulmod_7681_x16(f1, g1); + int16x16 d2 = mulmod_7681_x16(f2, g2); + int16x16 d3 = mulmod_7681_x16(f3, g3); + int16x16 d0d1 = add_x16(d0, d1); + int16x16 d0d1d2 = add_x16(d0d1, d2); + int16x16 d0d1d2d3 = squeeze_7681_x16(add_x16(d0d1d2, d3)); + int16x16 d2d3 = add_x16(d2, d3); + int16x16 d1d2d3 = add_x16(d1, d2d3); + int16x16 e01 = mulmod_7681_x16(sub_x16(f0, f1), sub_x16(g0, g1)); + int16x16 e02 = mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g0, g2)); + int16x16 e03 = mulmod_7681_x16(sub_x16(f0, f3), sub_x16(g0, g3)); + int16x16 e12 = mulmod_7681_x16(sub_x16(f1, f2), sub_x16(g1, g2)); + int16x16 e13 = mulmod_7681_x16(sub_x16(f1, f3), sub_x16(g1, g3)); + int16x16 e23 = mulmod_7681_x16(sub_x16(f2, f3), sub_x16(g2, g3)); + int16x16 h0 = d0; + int16x16 h1 = sub_x16(d0d1, e01); + int16x16 h2 = sub_x16(d0d1d2, e02); + int16x16 h3 = sub_x16(d0d1d2d3, add_x16(e12, e03)); + int16x16 h4 = sub_x16(d1d2d3, e13); + int16x16 h5 = sub_x16(d2d3, e23); + int16x16 h6 = d3; + int16x16 twist = load_x16(&y_7681.v[i]); + h4 = mulmod_7681_x16(h4, twist); + h5 = mulmod_7681_x16(h5, twist); + h6 = mulmod_7681_x16(h6, twist); + h0 = add_x16(h0, h4); + h1 = add_x16(h1, h5); + h2 = add_x16(h2, h6); + store_x16(&hpad[0][i], squeeze_7681_x16(h0)); + store_x16(&hpad[1][i], squeeze_7681_x16(h1)); + store_x16(&hpad[2][i], squeeze_7681_x16(h2)); + store_x16(&hpad[3][i], squeeze_7681_x16(h3)); + } + + PQCLEAN_SNTRUP857_AVX2_invntt512_7681(hpad[0], 4); + unstride(h_7681, (const int16(*)[512]) hpad); + + stride(fpad, f); + PQCLEAN_SNTRUP857_AVX2_ntt512_10753(fpad[0], 4); + + stride(gpad, g); + PQCLEAN_SNTRUP857_AVX2_ntt512_10753(gpad[0], 4); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_10753_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_10753_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_10753_x16(load_x16(&fpad[2][i])); + int16x16 f3 = squeeze_10753_x16(load_x16(&fpad[3][i])); + int16x16 g0 = squeeze_10753_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_10753_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_10753_x16(load_x16(&gpad[2][i])); + int16x16 g3 = squeeze_10753_x16(load_x16(&gpad[3][i])); + int16x16 d0 = mulmod_10753_x16(f0, g0); + int16x16 d1 = mulmod_10753_x16(f1, g1); + int16x16 d2 = mulmod_10753_x16(f2, g2); + int16x16 d3 = mulmod_10753_x16(f3, g3); + int16x16 d0d1 = add_x16(d0, d1); + int16x16 d0d1d2 = add_x16(d0d1, d2); + int16x16 d0d1d2d3 = squeeze_10753_x16(add_x16(d0d1d2, d3)); + int16x16 d2d3 = add_x16(d2, d3); + int16x16 d1d2d3 = add_x16(d1, d2d3); + 
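+        /* Karatsuba cross products: e_ij = (f_i - f_j) * (g_i - g_j) mod 10753 */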
int16x16 e01 = mulmod_10753_x16(sub_x16(f0, f1), sub_x16(g0, g1)); + int16x16 e02 = mulmod_10753_x16(sub_x16(f0, f2), sub_x16(g0, g2)); + int16x16 e03 = mulmod_10753_x16(sub_x16(f0, f3), sub_x16(g0, g3)); + int16x16 e12 = mulmod_10753_x16(sub_x16(f1, f2), sub_x16(g1, g2)); + int16x16 e13 = mulmod_10753_x16(sub_x16(f1, f3), sub_x16(g1, g3)); + int16x16 e23 = mulmod_10753_x16(sub_x16(f2, f3), sub_x16(g2, g3)); + int16x16 h0 = d0; + int16x16 h1 = sub_x16(d0d1, e01); + int16x16 h2 = sub_x16(d0d1d2, e02); + int16x16 h3 = sub_x16(d0d1d2d3, add_x16(e12, e03)); + int16x16 h4 = sub_x16(d1d2d3, e13); + int16x16 h5 = sub_x16(d2d3, e23); + int16x16 h6 = d3; + int16x16 twist = load_x16(&y_10753.v[i]); + h4 = mulmod_10753_x16(h4, twist); + h5 = mulmod_10753_x16(h5, twist); + h6 = mulmod_10753_x16(h6, twist); + h0 = add_x16(h0, h4); + h1 = add_x16(h1, h5); + h2 = add_x16(h2, h6); + store_x16(&hpad[0][i], squeeze_10753_x16(h0)); + store_x16(&hpad[1][i], squeeze_10753_x16(h1)); + store_x16(&hpad[2][i], squeeze_10753_x16(h2)); + store_x16(&hpad[3][i], squeeze_10753_x16(h3)); + } + + PQCLEAN_SNTRUP857_AVX2_invntt512_10753(hpad[0], 4); + unstride(h_10753, (const int16(*)[512]) hpad); + + for (i = 0; i < 2048; i += 16) { + int16x16 u1 = load_x16(&h_10753[i]); + int16x16 u2 = load_x16(&h_7681[i]); + int16x16 t; + u1 = mulmod_10753_x16(u1, const_x16(1268)); + u2 = mulmod_7681_x16(u2, const_x16(956)); + t = mulmod_7681_x16(sub_x16(u2, u1), const_x16(-2539)); + t = add_x16(u1, mulmod_5167_x16(t, const_x16(2146))); + store_x16(&h[i], t); + } +} + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xint16 + +#define p 857 +#define q 5167 + +static inline int16x16 freeze_5167_x16(int16x16 x) { + int16x16 mask, xq; + x = add_x16(x, const_x16(q)&signmask_x16(x)); + mask = signmask_x16(sub_x16(x, const_x16((q + 1) / 2))); + xq = sub_x16(x, const_x16(q)); + x = _mm256_blendv_epi8(xq, x, mask); + return x; +} + +int PQCLEAN_SNTRUP857_AVX2_crypto_core_multsntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + vec1024 x1, x2; + vec2048 x3; +#define f (x1.v) +#define g (x2.v) +#define fg (x3.v) +#define h f + int i; + int16x16 x; + + x = const_x16(0); + for (i = p & ~15; i < 1024; i += 16) { + store_x16(&f[i], x); + } + for (i = p & ~15; i < 1024; i += 16) { + store_x16(&g[i], x); + } + + crypto_decode_pxint16(f, inbytes); + + for (i = 0; i < 1024; i += 16) { + x = load_x16(&f[i]); + x = freeze_5167_x16(squeeze_5167_x16(x)); + store_x16(&f[i], x); + } + for (i = 0; i < p; ++i) { + int8 gi = kbytes[i]; + int8 gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + mult1024(fg, f, g); + + fg[0] -= fg[p - 1]; + for (i = 0; i < 1024; i += 16) { + int16x16 fgi = load_x16(&fg[i]); + int16x16 fgip = load_x16(&fg[i + p]); + int16x16 fgip1 = load_x16(&fg[i + p - 1]); + x = add_x16(fgi, add_x16(fgip, fgip1)); + x = freeze_5167_x16(squeeze_5167_x16(x)); + store_x16(&h[i], x); + } + + crypto_encode_pxint16(outbytes, h); + + return 0; +} diff --git a/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857.h b/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857.h new file mode 100644 index 00000000..db467cc1 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_MULTSNTRUP857_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_MULTSNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_multsntrup857_OUTPUTBYTES 1714 
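+/* 1714 bytes = 857 coefficients * 2 bytes (int16 encoding) */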
+#define PQCLEAN_SNTRUP857_AVX2_crypto_core_multsntrup857_INPUTBYTES 1714 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_multsntrup857_KEYBYTES 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_multsntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_AVX2_crypto_core_multsntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857_ntt.c b/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857_ntt.c new file mode 100644 index 00000000..05fbade0 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857_ntt.c @@ -0,0 +1,927 @@ +#include "crypto_core_multsntrup857.h" +#include "crypto_core_multsntrup857_ntt.h" +#include +#include + +/* auto-generated; do not edit */ + + +typedef int8_t int8; +typedef int16_t int16; + +#define zeta(n,i) (((__m256i *) zeta_##n)[(i)]) +#define zeta_x4(n,i) (((__m256i *) zeta_x4_##n)[(i)]) +#define zeta_qinv(n,i) (((__m256i *) qinvzeta_##n)[(i)]) +#define zeta_x4_qinv(n,i) (((__m256i *) qinvzeta_x4_##n)[(i)]) +#define zetainv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_x4_##n+2*(n)+4-16*((i)+1))) +#define zetainv_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_##n+(n)/2+1-16*((i)+1))) +#define zetainv_x4_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_x4_##n+2*(n)+4-16*((i)+1))) + +typedef union { + int16 data[93 * 16]; + __m256i _dummy; +} vec1488; + +static const vec1488 qdata_7681 = { .data = { + +#define q_x16 (qdata[0]) + 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, + +#define qrecip_x16 (qdata[1]) + 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, + +#define qshift_x16 (qdata[2]) + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + +#define zeta4_x16 (qdata[3]) + -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, + +#define zeta4_x16_qinv (qdata[4]) + -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, + +#define zeta8_x16 (qdata[5]) + -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, + +#define zeta8_x16_qinv (qdata[6]) + -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, + +#define zetainv8_x16 (qdata[7]) + -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, + +#define zetainv8_x16_qinv (qdata[8]) + -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, + +#define zeta_x4_16 (qdata+9) + -3593, -3593, -3593, -3593, -2194, -2194, -2194, -2194, -3625, -3625, -3625, -3625, 1100, 1100, 1100, 1100, + -3777, -3777, -3777, -3777, -2456, -2456, -2456, -2456, 3182, 3182, 3182, 3182, 3696, 3696, 3696, 3696, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_16 (qdata+12) + -9, -9, -9, -9, 4974, 4974, 4974, 4974, -16425, -16425, -16425, -16425, 7244, 7244, 7244, 7244, + -28865, -28865, -28865, -28865, -14744, -14744, -14744, -14744, 10350, 10350, 10350, 10350, -4496, -4496, -4496, -4496, + 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define 
zeta_x4_32 (qdata+15) + -3593, -3593, -3593, -3593, 1414, 1414, 1414, 1414, -2194, -2194, -2194, -2194, -2495, -2495, -2495, -2495, + -3625, -3625, -3625, -3625, 2876, 2876, 2876, 2876, 1100, 1100, 1100, 1100, -2250, -2250, -2250, -2250, + -3777, -3777, -3777, -3777, -1701, -1701, -1701, -1701, -2456, -2456, -2456, -2456, 834, 834, 834, 834, + 3182, 3182, 3182, 3182, -2319, -2319, -2319, -2319, 3696, 3696, 3696, 3696, 121, 121, 121, 121, + 3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_x4_32 (qdata+20) + -9, -9, -9, -9, 20870, 20870, 20870, 20870, 4974, 4974, 4974, 4974, 22593, 22593, 22593, 22593, + -16425, -16425, -16425, -16425, 828, 828, 828, 828, 7244, 7244, 7244, 7244, -23754, -23754, -23754, -23754, + -28865, -28865, -28865, -28865, 20315, 20315, 20315, 20315, -14744, -14744, -14744, -14744, 18242, 18242, 18242, 18242, + 10350, 10350, 10350, 10350, -18191, -18191, -18191, -18191, -4496, -4496, -4496, -4496, -11655, -11655, -11655, -11655, + 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_64 (qdata+25) + -3593, -617, 1414, 3706, -2194, -1296, -2495, -2237, -3625, 2830, 2876, -1599, 1100, 1525, -2250, 2816, + -3777, 1921, -1701, 2006, -2456, 1483, 834, -1986, 3182, 3364, -2319, -1993, 3696, -2557, 121, 2088, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_64 (qdata+28) + -9, 19351, 20870, -15750, 4974, -9488, 22593, 7491, -16425, 26382, 828, 23489, 7244, 20469, -23754, 2816, + -28865, -5759, 20315, -3114, -14744, 15307, 18242, -19394, 10350, -10972, -18191, -31177, -4496, -25597, -11655, 22568, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_128 (qdata+31) + -3593, -2804, -617, -396, 1414, -549, 3706, 810, -2194, -1321, -1296, 438, -2495, -2535, -2237, -3689, + -3625, 2043, 2830, -1881, 2876, 3153, -1599, 7, 1100, -514, 1525, -1760, -2250, -2440, 2816, 3600, + -3777, 103, 1921, -3174, -1701, 1535, 2006, -1887, -2456, 1399, 1483, -679, 834, 3772, -1986, 1738, + 3182, -1431, 3364, -3555, -2319, -2310, -1993, 638, 3696, -2956, -2557, -1305, 121, 2555, 2088, -3266, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_128 (qdata+36) + -9, -29428, 19351, 26228, 20870, 21467, -15750, 5930, 4974, -14121, -9488, -21066, 22593, 2073, 7491, 16279, + -16425, -25093, 26382, 26279, 828, -29103, 23489, 11783, 7244, 14846, 20469, 14624, -23754, -6536, 2816, 11792, + -28865, -4505, -5759, -6246, 20315, 9215, -3114, 6817, -14744, 4983, 15307, -28839, 18242, 1724, -19394, 23242, + 10350, -21399, -10972, -29667, -18191, -21766, -31177, 15998, -4496, 23668, -25597, -5913, -11655, -24581, 22568, -20674, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_256 (qdata+41) + -3593, 2665, -2804, -2572, -617, 727, -396, 3417, 1414, 2579, -549, 373, 3706, 3750, 810, -1054, + -2194, -2133, -1321, 1681, -1296, -1386, 438, -2732, -2495, 1919, -2535, -2391, -2237, 2835, -3689, 2, + -3625, -783, 2043, 3145, 2830, 1533, -1881, 2789, 2876, 2649, 3153, 3692, -1599, -1390, 7, -1166, + 1100, 3310, -514, 2224, 1525, -2743, -1760, 2385, -2250, -486, -2440, -1756, 2816, -3816, 3600, -3831, + -3777, -1799, 103, 1497, 1921, 1521, -3174, -194, -1701, -859, 1535, 2175, 2006, -2762, -1887, -1698, + -2456, -3480, 1399, 2883, 1483, -3428, -679, -2113, 834, 1532, 3772, -660, -1986, -2764, 1738, -915, + 3182, 1056, -1431, 1350, 3364, 1464, -3555, 2919, -2319, -2160, -2310, 730, -1993, -1598, 638, 3456, + 3696, -1168, -2956, -3588, -2557, -921, -1305, 3405, 121, -404, 2555, -3135, 2088, 2233, -3266, -2426, + 
3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_256 (qdata+50) + -9, -17303, -29428, 24052, 19351, -12073, 26228, -24743, 20870, -12269, 21467, 19317, -15750, -25946, 5930, 32738, + 4974, -4693, -14121, 2193, -9488, 26262, -21066, 7508, 22593, 9599, 2073, 10409, 7491, -12013, 16279, -15358, + -16425, -16655, -25093, 32329, 26382, 24573, 26279, 13541, 828, -25511, -29103, 26220, 23489, -8558, 11783, -24718, + 7244, 10478, 14846, 26800, 20469, 26441, 14624, -29871, -23754, -3558, -6536, -16092, 2816, 8472, 11792, -7415, + -28865, -13575, -4505, -26663, -5759, -14351, -6246, -17602, 20315, -22875, 9215, 9855, -3114, -24266, 6817, -2722, + -14744, -15768, 4983, 12611, 15307, -21860, -28839, -27201, 18242, 32252, 1724, 21868, -19394, -8908, 23242, 13933, + 10350, 17440, -21399, -11962, -10972, 30136, -29667, -1689, -18191, 6032, -21766, 30426, -31177, 15810, 15998, 3456, + -4496, -9360, 23668, 27132, -25597, -5529, -5913, 1869, -11655, 22124, -24581, 21953, 22568, 23225, -20674, 17030, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define zeta_512 (qdata+59) + -3593, 2005, 2665, 2891, -2804, 2345, -2572, 1121, -617, -188, 727, 2786, -396, -3208, 3417, -17, + 1414, -3752, 2579, 2815, -549, 1837, 373, 151, 3706, -1012, 3750, -1509, 810, -3214, -1054, 3177, + -2194, -1403, -2133, -3314, -1321, 83, 1681, -658, -1296, 2070, -1386, -3547, 438, 3781, -2732, 2230, + -2495, -1669, 1919, 2589, -2535, -3312, -2391, -3542, -2237, -1441, 2835, -3568, -3689, -402, 2, -1070, + -3625, 3763, -783, -3550, 2043, -2303, 3145, -436, 2830, -893, 1533, 1712, -1881, 124, 2789, -2001, + 2876, -2460, 2649, 3770, 3153, 2965, 3692, -1203, -1599, 2874, -1390, -1407, 7, -3745, -1166, 1649, + 1100, 2937, 3310, 3461, -514, -1526, 2224, 715, 1525, -1689, -2743, 434, -1760, -3163, 2385, -929, + -2250, -2167, -486, -1144, -2440, -370, -1756, 2378, 2816, -1084, -3816, -1586, 3600, 1931, -3831, -1242, + -3777, 592, -1799, 2340, 103, -1338, 1497, -2071, 1921, 1519, 1521, 451, -3174, 589, -194, -3744, + -1701, 3677, -859, -1295, 1535, 642, 2175, -3794, 2006, 2130, -2762, 2918, -1887, 3334, -1698, 2072, + -2456, 509, -3480, 2998, 1399, -3408, 2883, 1476, 1483, -2262, -3428, -1779, -679, 2258, -2113, 1348, + 834, -692, 1532, 2247, 3772, 2083, -660, -226, -1986, 2532, -2764, -3693, 1738, -429, -915, -2059, + 3182, 2812, 1056, 3434, -1431, -2515, 1350, -236, 3364, -2386, 1464, 222, -3555, -2963, 2919, -2422, + -2319, -3657, -2160, 3450, -2310, -791, 730, 1181, -1993, -1404, -1598, 2339, 638, -3366, 3456, 2161, + 3696, -3343, -1168, 2719, -2956, -826, -3588, -670, -2557, 777, -921, 1151, -1305, -796, 3405, -1278, + 121, -3287, -404, 1072, 2555, 293, -3135, 2767, 2088, -3335, 2233, 3581, -3266, 3723, -2426, -179, + 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +#define qinvzeta_512 (qdata+76) + -9, 4565, -17303, 16715, -29428, 15145, 24052, -22943, 19351, 1860, -12073, -28958, 26228, -7304, -24743, -529, + 20870, -24232, -12269, 10495, 21467, -16083, 19317, 20119, -15750, -27636, -25946, -12261, 5930, -26766, 32738, -16791, + 4974, 25733, -4693, 20238, -14121, 18003, 2193, 6510, -9488, 29718, 26262, -25563, -21066, -1851, 7508, -19274, + 22593, -28805, 9599, -23523, 2073, 4880, 10409, 1578, 7491, -10145, -12013, 4624, 16279, 6766, -15358, 24530, + -16425, 5299, -16655, -2526, -25093, -9983, 32329, 5708, 26382, -23933, 24573, 26288, 26279, 30844, 13541, 30255, + 828, 15972, -25511, 17082, -29103, -27243, 26220, -2739, 23489, 16186, -8558, -9087, 11783, -12449, -24718, -14223, + 7244, -8839, 10478, 
30597, 14846, -12790, 26800, 14539, 20469, -6297, 26441, 9650, 14624, -25179, -29871, -9633, + -23754, -5751, -3558, 2952, -6536, 23182, -16092, 23882, 2816, 964, 8472, -10802, 11792, -17013, -7415, -30938, + -28865, -23984, -13575, -11996, -4505, -14650, -26663, -22039, -5759, 1007, -14351, 10179, -6246, -947, -17602, -20128, + 20315, 10333, -22875, -17167, 9215, -14718, 9855, -29394, -3114, 27730, -24266, 5990, 6817, 22790, -2722, 14360, + -14744, 23549, -15768, -18506, 4983, 21168, 12611, 3524, 15307, 2858, -21860, 29453, -28839, 27858, -27201, 3396, + 18242, 5452, 32252, -18745, 1724, -4573, 21868, 31518, -19394, 20964, -8908, -18541, 23242, 17491, 13933, 16885, + 10350, -32004, 17440, -24214, -21399, -20435, -11962, -22764, -10972, -27986, 30136, -802, -29667, 11885, -1689, -13686, + -18191, 32695, 6032, -16006, -21766, -20759, 30426, -24931, -31177, -32124, 15810, -4317, 15998, 26330, 3456, -13711, + -4496, -19215, -9360, 26783, 23668, -14138, 27132, -32414, -25597, -2807, -5529, 8831, -5913, 17636, 1869, -16638, + -11655, 9513, 22124, 25648, -24581, -21723, 21953, -14129, 22568, -15111, 23225, 26621, -20674, -15221, 17030, -1715, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static const vec1488 qdata_10753 = { .data = { + + 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, + + 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, + + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + + 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, + + 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, + + 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, + + -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, + + 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, + + -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, + + 1018, 1018, 1018, 1018, 2413, 2413, 2413, 2413, 4188, 4188, 4188, 4188, 357, 357, 357, 357, + 223, 223, 223, 223, -3686, -3686, -3686, -3686, -3688, -3688, -3688, -3688, -376, -376, -376, -376, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 10093, 10093, 10093, 10093, -1956, -1956, -1956, -1956, 28517, 28517, 28517, 28517, + 27359, 27359, 27359, 27359, -21094, -21094, -21094, -21094, 408, 408, 408, 408, -20856, -20856, -20856, -20856, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 1018, 1018, 1018, -2695, -2695, -2695, -2695, 2413, 2413, 2413, 2413, 425, 425, 425, 425, + 4188, 4188, 4188, 4188, -4855, -4855, -4855, -4855, 357, 357, 357, 357, -3364, -3364, -3364, -3364, + 223, 223, 223, 223, 730, 730, 730, 730, -3686, -3686, -3686, -3686, -4544, -4544, -4544, -4544, + -3688, -3688, -3688, -3688, -2236, -2236, -2236, -2236, -376, -376, -376, -376, 3784, 3784, 3784, 3784, + -1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -6, -6, -6, 7033, 7033, 7033, 7033, 10093, 10093, 10093, 10093, 18345, 18345, 18345, 18345, + -1956, -1956, -1956, -1956, 29449, 29449, 29449, 29449, 28517, 28517, 28517, 28517, -9508, -9508, -9508, -9508, + 27359, 27359, 27359, 27359, 16090, 16090, 16090, 16090, -21094, -21094, -21094, -21094, 28224, 28224, 28224, 28224, + 408, 408, 408, 408, -12476, -12476, -12476, -12476, 
-20856, -20856, -20856, -20856, 16072, 16072, 16072, 16072, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -1520, -2695, 1341, 2413, 918, 425, 5175, 4188, -4035, -4855, 341, 357, 4347, -3364, 5213, + 223, -4875, 730, 1931, -3686, -2503, -4544, -4095, -3688, 5063, -2236, -3823, -376, 3012, 3784, -2629, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 23056, 7033, 829, 10093, 26518, 18345, 3639, -1956, -4547, 29449, 3925, 28517, -7429, -9508, -11683, + 27359, -17675, 16090, 14731, -21094, -25543, 28224, -14847, 408, 28103, -12476, 10001, -20856, -7228, 16072, 18363, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -2935, -1520, -4744, -2695, -205, 1341, 1299, 2413, 4, 918, -4379, 425, -4616, 5175, -544, + 4188, 4129, -4035, 4102, -4855, -1287, 341, -2388, 357, 1284, 4347, 2984, -3364, 2178, 5213, -2576, + 223, 2790, -4875, 4876, 730, -4513, 1931, -3085, -3686, 3550, -2503, 847, -4544, 193, -4095, 1085, + -3688, 3091, 5063, -4742, -2236, 2982, -3823, -1009, -376, -268, 3012, 3062, 3784, -2565, -2629, 4189, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, 31369, 23056, 15736, 7033, -24269, 829, -6381, 10093, 22532, 26518, 23781, 18345, 15864, 3639, 15840, + -1956, -23007, -4547, 5126, 29449, 8441, 3925, -16724, 28517, 23812, -7429, 31656, -9508, -19326, -11683, -27152, + 27359, 20198, -17675, 6924, 16090, 22623, 14731, 5619, -21094, -24098, -25543, 3407, 28224, 22209, -14847, 573, + 408, -4589, 28103, -5766, -12476, -12378, 10001, -31217, -20856, -2316, -7228, -20490, 16072, -14341, 18363, -12707, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, -4734, -2935, -400, -1520, 4977, -4744, -2973, -2695, 512, -205, -779, 1341, -1356, 1299, 635, + 2413, 567, 4, -4286, 918, -5114, -4379, -1586, 425, 1615, -4616, -336, 5175, -1841, -544, 2234, + 4188, -3441, 4129, 636, -4035, -4580, 4102, 2684, -4855, 3057, -1287, -2740, 341, -5156, -2388, -472, + 357, -794, 1284, 578, 4347, 3615, 2984, -3715, -3364, 2271, 2178, -326, 5213, 454, -2576, -3337, + 223, 2998, 2790, -151, -4875, 2981, 4876, 1324, 730, 2774, -4513, 2206, 1931, 886, -3085, -970, + -3686, 3198, 3550, 2737, -2503, -909, 847, 1068, -4544, -2213, 193, 2884, -4095, -4808, 1085, 4123, + -3688, 5341, 3091, 5294, 5063, -116, -4742, -5116, -2236, -2045, 2982, -1572, -3823, 4828, -1009, 467, + -376, 5023, -268, -3169, 3012, -1458, 3062, -1268, 3784, -675, -2565, 1006, -2629, 5064, 4189, 864, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -26238, 31369, -24976, 23056, -30351, 15736, -18845, 7033, 512, -24269, -13579, 829, 29364, -6381, -11141, + 10093, -969, 22532, 6978, 26518, -4090, 23781, 11726, 18345, 4175, 15864, 7856, 3639, 719, 15840, -31558, + -1956, 31887, -23007, -21892, -4547, 22044, 5126, -19844, 29449, -32271, 8441, 32076, 3925, -11300, -16724, 28200, + 28517, 16614, 23812, 11842, -7429, -2017, 31656, 28541, -9508, 29407, -19326, 31418, -11683, -31290, -27152, 27895, + 27359, 12214, 20198, -14999, -17675, -1627, 6924, -13012, 16090, -4394, 22623, 7326, 14731, -22666, 5619, 8246, + -21094, 24702, -24098, 177, -25543, 7795, 3407, -13268, 28224, 2395, 22209, -7356, -14847, -17096, 573, -24037, + 408, -11555, -4589, -30546, 28103, 1932, -5766, 17412, -12476, 31235, -12378, -7716, 10001, -1316, -31217, 25555, + -20856, -609, -2316, -8801, -7228, 11854, -20490, 780, 16072, -17571, -14341, -2066, 18363, 17352, -12707, 17248, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 1018, 3453, -4734, 4519, -2935, 2118, -400, -554, -1520, 2196, 4977, 1893, -4744, -1409, -2973, 
-1053, + -2695, 4601, 512, 279, -205, -3241, -779, 4889, 1341, 3524, -1356, -1663, 1299, 2283, 635, 73, + 2413, 2428, 567, 624, 4, -1930, -4286, 3419, 918, -2062, -5114, 5068, -4379, -97, -1586, 1782, + 425, 4621, 1615, 355, -4616, 1349, -336, 825, 5175, 3135, -1841, 1160, -544, 4408, 2234, -2605, + 4188, 854, -3441, -1056, 4129, 2439, 636, 4967, -4035, -4782, -4580, -5268, 4102, -663, 2684, -4670, + -4855, 3760, 3057, 3535, -1287, 2680, -2740, -569, 341, 2139, -5156, 3827, -2388, 1639, -472, 1927, + 357, 5172, -794, -4003, 1284, 4144, 578, 693, 4347, 4784, 3615, 3125, 2984, 1122, -3715, 2113, + -3364, -573, 2271, -4328, 2178, 2909, -326, -4000, 5213, -4447, 454, -3995, -2576, -4428, -3337, 2529, + 223, 5309, 2998, 5120, 2790, -2050, -151, 2963, -4875, 2657, 2981, -2807, 4876, 2237, 1324, -4403, + 730, 2624, 2774, -5083, -4513, 40, 2206, 152, 1931, -1573, 886, 2625, -3085, -778, -970, -5107, + -3686, 4250, 3198, -5356, 3550, -3148, 2737, -3360, -2503, -2015, -909, 3096, 847, 5313, 1068, 834, + -4544, -1132, -2213, -2151, 193, -1722, 2884, -4393, -4095, 2662, -4808, -2788, 1085, -1992, 4123, 5334, + -3688, 5215, 5341, -1689, 3091, -2117, 5294, 4859, 5063, 3410, -116, 2205, -4742, -2374, -5116, -4720, + -2236, 3570, -2045, 2813, 2982, 2087, -1572, -4973, -3823, 458, 4828, 3891, -1009, -2419, 467, -4891, + -376, -1381, 5023, 1204, -268, 274, -3169, -3260, 3012, -1635, -1458, 4540, 3062, -4254, -1268, -1111, + 3784, 2230, -675, -2279, -2565, -4359, 1006, -1510, -2629, 5015, 5064, -2449, 4189, -5005, 864, 2487, + -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + -6, -29827, -26238, -21593, 31369, -29626, -24976, -7722, 23056, -16236, -30351, 30053, 15736, 9343, -18845, -16925, + 7033, 14329, 512, 15127, -24269, -21161, -13579, -1767, 829, -6716, 29364, -12415, -6381, 31467, -11141, 1609, + 10093, -20100, -969, -23952, 22532, -25482, 6978, 8027, 26518, 17394, -4090, -25652, 23781, -5729, 11726, -21770, + 18345, -4083, 4175, -15517, 15864, -19643, 7856, -22215, 3639, -18881, 719, -19320, 15840, -7880, -31558, 22483, + -1956, -6314, 31887, 15328, -23007, -7289, -21892, 11623, -4547, 31058, 22044, 13164, 5126, -15511, -19844, 6594, + 29449, 11952, -32271, 6095, 8441, 23160, 32076, 22471, 3925, 6747, -11300, 12531, -16724, 8295, 28200, -7801, + 28517, -29644, 16614, -20899, 23812, 12336, 11842, 20661, -7429, 12976, -2017, 23093, 31656, -3998, 28541, 24129, + -9508, -61, 29407, -232, -19326, -13987, 31418, 12384, -11683, -31583, -31290, 24165, -27152, 26292, 27895, 8161, + 27359, 4797, 12214, 5120, 20198, 19454, -14999, -4717, -17675, 8289, -1627, 31497, 6924, 1725, -13012, 19661, + 16090, -30144, -4394, -9691, 22623, 28712, 7326, 4248, 14731, 3035, -22666, 24641, 5619, -24330, 8246, -13811, + -21094, -13158, 24702, -23788, -24098, 27572, 177, 13024, -25543, -29151, 7795, 7192, 3407, 27329, -13268, 12098, + 28224, -19564, 2395, -8807, 22209, 32070, -7356, -22313, -14847, 20070, -17096, 23836, 573, -14280, -24037, -1834, + 408, 32351, -11555, 4967, -4589, 18875, -30546, -6917, 28103, -26286, 1932, 18077, -5766, 29370, 17412, 19856, + -12476, 23026, 31235, -30467, -12378, -24025, -7716, -12653, 10001, -8758, -1316, -20173, -31217, -11123, 25555, 23269, + -20856, -29541, -609, 31924, -2316, 3346, -8801, -13500, -7228, 14237, 11854, 14780, -20490, -9374, 780, 16809, + 16072, 11446, -17571, -8935, -14341, 5369, -2066, -18918, 18363, 19863, 17352, -16273, -12707, 3699, 17248, 951, + 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } +}; + +static inline __m256i sub_x16(__m256i a, __m256i b) { + 
//__asm__("vpsubw %1,%0,%0" : "+x"(a),"+x"(b)); + return _mm256_sub_epi16(a, b); +} + +static inline __m256i add_x16(__m256i a, __m256i b) { + return _mm256_add_epi16(a, b); +} + +static inline __m256i reduce_x16(const __m256i *qdata, __m256i x) { + __m256i y = _mm256_mulhi_epi16(x, qrecip_x16); + y = _mm256_mulhrs_epi16(y, qshift_x16); + y = _mm256_mullo_epi16(y, q_x16); + return sub_x16(x, y); +} + +static inline __m256i mulmod_x16_scaled(const __m256i *qdata, __m256i x, __m256i y, __m256i yqinv) { + __m256i b = _mm256_mulhi_epi16(x, y); + __m256i d = _mm256_mullo_epi16(x, yqinv); + __m256i e = _mm256_mulhi_epi16(d, q_x16); + return sub_x16(b, e); +} + +typedef union { + int8 data[32]; + __m256i _dummy; +} byte32; +static const byte32 shuffle_buf = { .data = { + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + } +}; +#define shuffle (*(__m256i *) shuffle_buf.data) + +static inline __m256i _mm256_loadu_reverse16(const __m256i *p) { + __m256i x = _mm256_loadu_si256(p); + x = _mm256_permute2x128_si256(x, x, 1); + x = _mm256_shuffle_epi8(x, shuffle); + return x; +} + +static void ntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + for (rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 = _mm256_loadu_si256((__m256i *) (f + 96)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_0, zetainv_qinv_128_0); + f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 0), zeta_qinv(128, 0)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 0), zeta_qinv(64, 0)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = _mm256_loadu_si256((__m256i *) (f + 80)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = 
mulmod_x16_scaled(qdata, f3, zetainv_128_1, zetainv_qinv_128_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 1), zeta_qinv(128, 1)); + + g2 = _mm256_unpacklo_epi16(f2, f3); + g3 = _mm256_unpackhi_epi16(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 1), zeta_qinv(64, 1)); + f0 = reduce_x16(qdata, f0); + + g0 = _mm256_unpacklo_epi16(f0, f1); + h0 = _mm256_unpacklo_epi32(g0, g2); + h1 = _mm256_unpackhi_epi32(g0, g2); + g1 = _mm256_unpackhi_epi16(f0, f1); + h2 = _mm256_unpacklo_epi32(g1, g3); + h3 = _mm256_unpackhi_epi32(g1, g3); + f0 = _mm256_permute2x128_si256(h0, h1, 0x20); + f2 = _mm256_permute2x128_si256(h0, h1, 0x31); + f1 = _mm256_permute2x128_si256(h2, h3, 0x20); + f3 = _mm256_permute2x128_si256(h2, h3, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 0), zeta_x4_qinv(16, 0)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 32)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g2 = _mm256_unpacklo_epi64(f2, f3); + g3 = _mm256_unpackhi_epi64(f2, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 1), zeta_x4_qinv(16, 1)); + f0 = reduce_x16(qdata, f0); + + g1 = _mm256_unpackhi_epi64(f0, f1); + g0 = _mm256_unpacklo_epi64(f0, f1); + f1 = _mm256_permute2x128_si256(g1, g3, 0x20); + f3 = _mm256_permute2x128_si256(g1, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g2, 0x20); + f2 = _mm256_permute2x128_si256(g0, g2, 0x31); + + _mm256_storeu_si256((__m256i *) (f + 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); + _mm256_storeu_si256((__m256i *) 
(f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + + f1 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 32)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f2 = add_x16(g2, g3); + f3 = sub_x16(g2, g3); + f2 = reduce_x16(qdata, f2); + f3 = reduce_x16(qdata, f3); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f0 = reduce_x16(qdata, f0); + + h0 = f0; + h1 = f1; + h2 = f2; + h3 = f3; + + f1 = _mm256_loadu_si256((__m256i *) (f + 80)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv8_x16, zetainv8_x16_qinv); + f2 = mulmod_x16_scaled(qdata, f2, zeta8_x16, zeta8_x16_qinv); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta4_x16, zeta4_x16_qinv); + f0 = reduce_x16(qdata, f0); + + g0 = add_x16(h0, f0); + g1 = add_x16(h1, f1); + g2 = add_x16(h2, f2); + g3 = add_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 0), g0); + _mm256_storeu_si256((__m256i *) (f + 16), g1); + _mm256_storeu_si256((__m256i *) (f + 32), g2); + _mm256_storeu_si256((__m256i *) (f + 48), g3); + g0 = sub_x16(h0, f0); + g1 = sub_x16(h1, f1); + g2 = sub_x16(h2, f2); + g3 = sub_x16(h3, f3); + _mm256_storeu_si256((__m256i *) (f + 64), g0); + _mm256_storeu_si256((__m256i *) (f + 80), g1); + _mm256_storeu_si256((__m256i *) (f + 96), g2); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + f += 128; + } +} + +static void ntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + int16 *origf = f; + int rep; + __m256i zetainv_512[8]; + __m256i zetainv_qinv_512[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + g3 = sub_x16(f1, f3); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f1, f3); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + g2 = sub_x16(f0, f2); + g0 = add_x16(f0, f2); + + f3 = sub_x16(g3, g2); + f2 = add_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zetainv_512[i], zetainv_qinv_512[i]); + f2 = mulmod_x16_scaled(qdata, f2, zeta(512, i), zeta_qinv(512, i)); + + f1 = sub_x16(g0, g1); + f0 = add_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zeta(256, i), zeta_qinv(256, i)); + f0 = reduce_x16(qdata, f0); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i), f0); + + } + f += 512; + } + f = origf; + ntt128(f, reps * 4, qdata); +} + +void PQCLEAN_SNTRUP857_AVX2_ntt512_7681(int16 *f, int reps) { + ntt512(f, 
reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_SNTRUP857_AVX2_ntt512_10753(int16 *f, int reps) { + ntt512(f, reps, (const __m256i *) qdata_10753.data); +} + +static void invntt128(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; + int16 *origf = f; + int rep; + __m256i zetainv_x4_16_0 = zetainv_x4(16, 0); + __m256i zetainv_x4_qinv_16_0 = zetainv_x4_qinv(16, 0); + __m256i zetainv_x4_32_0 = zetainv_x4(32, 0); + __m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); + __m256i zetainv_64_0 = zetainv(64, 0); + __m256i zetainv_qinv_64_0 = zetainv_qinv(64, 0); + __m256i zetainv_128_0 = zetainv(128, 0); + __m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); + __m256i zetainv_x4_16_1 = zetainv_x4(16, 1); + __m256i zetainv_x4_qinv_16_1 = zetainv_x4_qinv(16, 1); + __m256i zetainv_x4_32_1 = zetainv_x4(32, 1); + __m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); + __m256i zetainv_64_1 = zetainv(64, 1); + __m256i zetainv_qinv_64_1 = zetainv_qinv(64, 1); + __m256i zetainv_128_1 = zetainv(128, 1); + __m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + g0 = _mm256_loadu_si256((__m256i *) (f + 32)); + g1 = _mm256_loadu_si256((__m256i *) (f + 96)); + g2 = _mm256_loadu_si256((__m256i *) (f + 48)); + g3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + h1 = sub_x16(f0, f1); + h1 = reduce_x16(qdata, h1); + h0 = add_x16(f0, f1); + h3 = sub_x16(f2, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h2 = add_x16(f2, f3); + f1 = sub_x16(g0, g1); + f1 = mulmod_x16_scaled(qdata, f1, zetainv8_x16, zetainv8_x16_qinv); + f0 = add_x16(g0, g1); + f3 = sub_x16(g2, g3); + f3 = mulmod_x16_scaled(qdata, f3, zeta8_x16, zeta8_x16_qinv); + f2 = add_x16(g2, g3); + + g0 = add_x16(h0, h2); + g0 = reduce_x16(qdata, g0); + g2 = sub_x16(h0, h2); + g2 = reduce_x16(qdata, g2); + g1 = sub_x16(h1, h3); + g3 = add_x16(h1, h3); + h2 = sub_x16(f0, f2); + h2 = mulmod_x16_scaled(qdata, h2, zeta4_x16, zeta4_x16_qinv); + h0 = add_x16(f0, f2); + h3 = add_x16(f1, f3); + h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); + h1 = sub_x16(f1, f3); + + f0 = add_x16(g0, h0); + g0 = sub_x16(g0, h0); + f1 = add_x16(g1, h1); + g1 = sub_x16(g1, h1); + f2 = sub_x16(g2, h2); + g2 = add_x16(g2, h2); + f3 = sub_x16(g3, h3); + g3 = add_x16(g3, h3); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), g0); + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 96), g1); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + _mm256_storeu_si256((__m256i *) (f + 48), g2); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 112), g3); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 64)); + f2 = _mm256_loadu_si256((__m256i *) (f + 16)); + f3 = _mm256_loadu_si256((__m256i *) (f + 80)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = _mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 = _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + 
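+        /* apply the per-lane twiddle factors for this layer of the inverse transform */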
f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_0, zetainv_x4_qinv_32_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_0, zetainv_x4_qinv_16_0); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 64), f1); + _mm256_storeu_si256((__m256i *) (f + 80), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16), f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 32)); + f1 = _mm256_loadu_si256((__m256i *) (f + 96)); + f2 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_unpacklo_epi64(f0, f1); + g1 = _mm256_unpacklo_epi64(f2, f3); + g2 = _mm256_unpackhi_epi64(f0, f1); + g3 = _mm256_unpackhi_epi64(f2, f3); + f2 = _mm256_permute2x128_si256(g0, g1, 0x31); + f3 = _mm256_permute2x128_si256(g2, g3, 0x31); + f0 = _mm256_permute2x128_si256(g0, g1, 0x20); + f1 = _mm256_permute2x128_si256(g2, g3, 0x20); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_1, zetainv_x4_qinv_32_1); + f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_1, zetainv_x4_qinv_16_1); + + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 96), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 32), f0); + _mm256_storeu_si256((__m256i *) (f + 48), f2); + + f += 128; + } + f = origf; + for (rep = 0; rep < reps; ++rep) { + f0 = _mm256_loadu_si256((__m256i *) (f + 0)); + f2 = _mm256_loadu_si256((__m256i *) (f + 64)); + f1 = _mm256_loadu_si256((__m256i *) (f + 32)); + f3 = _mm256_loadu_si256((__m256i *) (f + 96)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = _mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_0, zetainv_qinv_128_0); + f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 0), zeta_qinv(128, 0)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_0, zetainv_qinv_64_0); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + _mm256_storeu_si256((__m256i *) (f + 0), f0); + 
_mm256_storeu_si256((__m256i *) (f + 64), f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16)); + f2 = _mm256_loadu_si256((__m256i *) (f + 80)); + f1 = _mm256_loadu_si256((__m256i *) (f + 48)); + f3 = _mm256_loadu_si256((__m256i *) (f + 112)); + + g0 = _mm256_permute2x128_si256(f0, f2, 0x20); + g2 = _mm256_permute2x128_si256(f0, f2, 0x31); + f0 = _mm256_unpacklo_epi16(g0, g2); + f2 = _mm256_unpackhi_epi16(g0, g2); + g1 = _mm256_permute2x128_si256(f1, f3, 0x20); + g3 = _mm256_permute2x128_si256(f1, f3, 0x31); + f1 = _mm256_unpacklo_epi16(g1, g3); + f3 = _mm256_unpackhi_epi16(g1, g3); + g1 = _mm256_unpackhi_epi16(f0, f2); + g0 = _mm256_unpacklo_epi16(f0, f2); + g3 = _mm256_unpackhi_epi16(f1, f3); + g2 = _mm256_unpacklo_epi16(f1, f3); + f2 = _mm256_unpacklo_epi64(g1, g3); + f3 = _mm256_unpackhi_epi64(g1, g3); + f0 = _mm256_unpacklo_epi64(g0, g2); + f1 = _mm256_unpackhi_epi64(g0, g2); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_1, zetainv_qinv_128_1); + f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 1), zeta_qinv(128, 1)); + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_1, zetainv_qinv_64_1); + + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g1 = add_x16(f0, f1); + g2 = sub_x16(f3, f2); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 48), f1); + _mm256_storeu_si256((__m256i *) (f + 112), f3); + _mm256_storeu_si256((__m256i *) (f + 16), f0); + _mm256_storeu_si256((__m256i *) (f + 80), f2); + + f += 128; + } +} + +static void invntt512(int16 *f, int reps, const __m256i *qdata) { + __m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ + /* [-Werror=unused-variable] */ /* int16 *origf = f; */ + int rep; + __m256i zetainv_512[8]; + __m256i zetainv_qinv_512[8]; + __m256i zetainv_256[8]; + __m256i zetainv_qinv_256[8]; + int i; + for (i = 0; i < 8; ++i) { + zetainv_512[i] = zetainv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_512[i] = zetainv_qinv(512, i); + } + for (i = 0; i < 8; ++i) { + zetainv_256[i] = zetainv(256, i); + } + for (i = 0; i < 8; ++i) { + zetainv_qinv_256[i] = zetainv_qinv(256, i); + } + invntt128(f, 4 * reps, qdata); + for (rep = 0; rep < reps; ++rep) { + for (i = 0; i < 8; ++i) { + f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); + f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); + + f2 = mulmod_x16_scaled(qdata, f2, zetainv_512[i], zetainv_qinv_512[i]); + f3 = mulmod_x16_scaled(qdata, f3, zeta(512, i), zeta_qinv(512, i)); + g3 = add_x16(f3, f2); + g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); + g2 = sub_x16(f3, f2); + + f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 0)); + f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); + + f0 = reduce_x16(qdata, f0); + f1 = mulmod_x16_scaled(qdata, f1, zetainv_256[i], zetainv_qinv_256[i]); + g1 = add_x16(f0, f1); + g0 = sub_x16(f0, f1); + + f1 = add_x16(g1, g3); + f3 = sub_x16(g1, g3); + f0 = add_x16(g0, g2); + f2 = sub_x16(g0, g2); + + _mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); + } + f += 512; + } +} + +void PQCLEAN_SNTRUP857_AVX2_invntt512_7681(int16 *f, int reps) { + invntt512(f, reps, (const __m256i *) qdata_7681.data); +} + +void PQCLEAN_SNTRUP857_AVX2_invntt512_10753(int16 *f, 
int reps) { + invntt512(f, reps, (const __m256i *) qdata_10753.data); +} diff --git a/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857_ntt.h b/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857_ntt.h new file mode 100644 index 00000000..a38bad6d --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_multsntrup857_ntt.h @@ -0,0 +1,13 @@ +#ifndef ntt_H +#define ntt_H + +#include <stdint.h> + + + +extern void PQCLEAN_SNTRUP857_AVX2_ntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP857_AVX2_ntt512_10753(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP857_AVX2_invntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP857_AVX2_invntt512_10753(int16_t *f, int reps); + +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_core_scale3sntrup857.c b/crypto_kem/sntrup857/avx2/crypto_core_scale3sntrup857.c new file mode 100644 index 00000000..6f4ae465 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_scale3sntrup857.c @@ -0,0 +1,47 @@ +#include "crypto_core_scale3sntrup857.h" +#include "crypto_decode_857xint16.h" +#include "crypto_encode_857xint16.h" +#include <immintrin.h> + +#define p 857 +#define q 5167 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xint16 + +typedef int16_t Fq; + +/* out = 3*in in Rq */ +int PQCLEAN_SNTRUP857_AVX2_crypto_core_scale3sntrup857(unsigned char *outbytes, const unsigned char *inbytes) { + int i = p - 16; + + __m256i save = _mm256_loadu_si256((__m256i *) (inbytes + 2 * i)); + /* in case outbytes = inbytes */ + + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) inbytes); + __m256i xneg; + x = _mm256_mullo_epi16(x, _mm256_set1_epi16(3)); + x = _mm256_sub_epi16(x, _mm256_set1_epi16((q + 1) / 2)); + xneg = _mm256_srai_epi16(x, 15); + x = _mm256_add_epi16(x, _mm256_set1_epi16(q)&xneg); + xneg = _mm256_srai_epi16(x, 15); + x = _mm256_add_epi16(x, _mm256_set1_epi16(q)&xneg); + x = _mm256_sub_epi16(x, _mm256_set1_epi16((q - 1) / 2)); + _mm256_storeu_si256((__m256i *) outbytes, x); + + inbytes += 32; + outbytes += 32; + i -= 16; + } while (i >= 0); + if (i <= -16) { + break; + } + inbytes += 2 * i; + outbytes += 2 * i; + _mm256_storeu_si256((__m256i *) outbytes, save); + } + + return 0; +} diff --git a/crypto_kem/sntrup857/avx2/crypto_core_scale3sntrup857.h b/crypto_kem/sntrup857/avx2/crypto_core_scale3sntrup857.h new file mode 100644 index 00000000..21f587e3 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_scale3sntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_SCALE3SNTRUP857_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_SCALE3SNTRUP857_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_scale3sntrup857_OUTPUTBYTES 1714 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_scale3sntrup857_INPUTBYTES 1714 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_scale3sntrup857_KEYBYTES 0 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_scale3sntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_AVX2_crypto_core_scale3sntrup857(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_core_weightsntrup857.c b/crypto_kem/sntrup857/avx2/crypto_core_weightsntrup857.c new file mode 100644 index 00000000..5df7914a --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_weightsntrup857.c @@ -0,0 +1,45 @@ +#include "crypto_core_weightsntrup857.h" +#include "crypto_encode_int16.h" +#include "params.h" +#include <immintrin.h> + +#define int8 int8_t +#define int16 int16_t + + +/* out = little-endian weight of bottom bits
of in */ +int PQCLEAN_SNTRUP857_AVX2_crypto_core_weightsntrup857(unsigned char *outbytes, const unsigned char *inbytes) { + int8 *in = (void *) inbytes; + int i; + __m256i sum, sumhi; + int16 weight; + + sum = _mm256_loadu_si256((__m256i *) (in + p - 32)); + sum &= _mm256_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0); + + for (i = p - 32; i >= 0; i -= 32) { + __m256i bits = _mm256_loadu_si256((__m256i *) in); + bits &= _mm256_set1_epi8(1); + sum = _mm256_add_epi8(sum, bits); + in += 32; + } + + /* sum is 32xint8; want to add these int8 */ + sumhi = _mm256_srli_epi16(sum, 8); + sum &= _mm256_set1_epi16(0xff); + sum = _mm256_add_epi16(sum, sumhi); + + /* sum is 16xint16; want to add these int16 */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[1]+sum[2]+sum[3]+sum[8]+sum[9]+sum[10]+sum[11] */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[1]+sum[8]+sum[9] */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[8] */ + + weight = _mm256_extract_epi16(sum, 0); + weight += _mm256_extract_epi16(sum, 8); + + PQCLEAN_SNTRUP857_AVX2_crypto_encode_int16(outbytes, &weight); + return 0; +} diff --git a/crypto_kem/sntrup857/avx2/crypto_core_weightsntrup857.h b/crypto_kem/sntrup857/avx2/crypto_core_weightsntrup857.h new file mode 100644 index 00000000..1d5b2226 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_weightsntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_WEIGHTSNTRUP857_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_WEIGHTSNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_weightsntrup857_OUTPUTBYTES 2 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_weightsntrup857_INPUTBYTES 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_weightsntrup857_KEYBYTES 0 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_weightsntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_AVX2_crypto_core_weightsntrup857(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_core_wforcesntrup857.c b/crypto_kem/sntrup857/avx2/crypto_core_wforcesntrup857.c new file mode 100644 index 00000000..889de6f9 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_wforcesntrup857.c @@ -0,0 +1,61 @@ +#include "crypto_core_wforcesntrup857.h" +#include "crypto_decode_int16.h" +#include "params.h" +#include + +#define int16 int16_t + + +/* out = in if bottom bits of in have weight w */ +/* otherwise out = (1,1,...,1,0,0,...,0) */ +int PQCLEAN_SNTRUP857_AVX2_crypto_core_wforcesntrup857(unsigned char *out, const unsigned char *in) { + int16 weight; + int16 mask; + __m256i maskvec; + int i; + + crypto_core_weight((unsigned char *) &weight, in); + PQCLEAN_SNTRUP857_AVX2_crypto_decode_int16(&weight, (unsigned char *) &weight); + + mask = (weight - w) | (w - weight); + mask >>= 15; + maskvec = _mm256_set1_epi16((short) ~mask); + + i = w - 32; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) in); + x ^= _mm256_set1_epi8(1); + x &= maskvec; + x ^= _mm256_set1_epi8(1); + _mm256_storeu_si256((__m256i *) out, x); + in += 32; + out += 32; + i -= 32; + } while (i >= 0); + if (i <= -32) { + break; + } + in += i; + out += i; + } + + i = p - w - 32; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) in); + x &= maskvec; + _mm256_storeu_si256((__m256i *) out, x); + in += 32; + out += 32; + i -= 32; + } while (i >= 0); + if (i <= -32) { + break; + } + in += i; + out += i; + } + + return 0; +} diff --git 
a/crypto_kem/sntrup857/avx2/crypto_core_wforcesntrup857.h b/crypto_kem/sntrup857/avx2/crypto_core_wforcesntrup857.h new file mode 100644 index 00000000..d1155a07 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_core_wforcesntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_WFORCESNTRUP857_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_CORE_WFORCESNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_wforcesntrup857_OUTPUTBYTES 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_wforcesntrup857_INPUTBYTES 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_wforcesntrup857_KEYBYTES 0 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_wforcesntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_AVX2_crypto_core_wforcesntrup857(unsigned char *out, const unsigned char *in); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_857x1723.c b/crypto_kem/sntrup857/avx2/crypto_decode_857x1723.c new file mode 100644 index 00000000..336f1b6b --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_857x1723.c @@ -0,0 +1,430 @@ +#include "crypto_decode_857x1723.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define int32 int32_t + +static inline int16 mullo(int16 x, int16 y) { + return x * y; +} + +static inline int16 mulhi(int16 x, int16 y) { + return (x * (int32)y) >> 16; +} + +static inline __m256i add(__m256i x, __m256i y) { + return _mm256_add_epi16(x, y); +} + +static inline __m256i sub(__m256i x, __m256i y) { + return _mm256_sub_epi16(x, y); +} + +static inline __m256i shiftleftconst(__m256i x, int16 y) { + return _mm256_slli_epi16(x, y); +} + +static inline __m256i signedshiftrightconst(__m256i x, int16 y) { + return _mm256_srai_epi16(x, y); +} + +static inline __m256i addconst(__m256i x, int16 y) { + return add(x, _mm256_set1_epi16(y)); +} + +static inline __m256i subconst(__m256i x, int16 y) { + return sub(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulloconst(__m256i x, int16 y) { + return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulhiconst(__m256i x, int16 y) { + return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i ifgesubconst(__m256i x, int16 y) { + __m256i y16 = _mm256_set1_epi16(y); + __m256i top16 = _mm256_set1_epi16((int16)(y - 1)); + return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); +} + +static inline __m256i ifnegaddconst(__m256i x, int16 y) { + return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); +} + +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x1723(void *v, const unsigned char *s) { + int16 *R0 = v; + int16 R1[429], R2[215], R3[108], R4[54], R5[27], R6[14], R7[7], R8[4], R9[2], R10[1]; + long long i; + int16 a0, a1, a2; + __m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; + + s += PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x1723_STRBYTES; + a1 = 0; + a1 += *--s; /* 0...255 */ + a1 -= 160; /* -160...95 */ + a1 += (a1 >> 15) & 160; /* 0...159 */ + R10[0] = a1; + + /* R10 ------> R9: reconstruct mod 1*[743]+[14044] */ + + i = 0; + s -= 2; + a0 = R10[0]; + a0 = mulhi(a0, 276) - mulhi(mullo(a0, -22580), 743); /* -372...440 */ + a0 += s[2 * i + 1]; /* -372...695 */ + a0 = mulhi(a0, 276) - mulhi(mullo(a0, -22580), 743); /* -374...374 */ + a0 += s[2 * i + 0]; /* -374...629 */ + a0 += (a0 >> 15) & 743; /* 0...742 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, -3881); + + /* invalid inputs might need reduction mod 14044 */ + a1 -= 14044; + a1 += (a1 >> 15) & 14044; + + R9[0] = a0; + R9[1] = a1; + s -= 0; + + /* R9 ------> R8: reconstruct mod 
3*[436]+[8246] */ + + i = 0; + s -= 1; + a2 = a0 = R9[1]; + a0 = mulhi(a0, -64) - mulhi(mullo(a0, 27056), 436); /* -234...218 */ + a0 += s[1 * i + 0]; /* -234...473 */ + a0 -= 436; /* -670..>37 */ + a0 += (a0 >> 15) & 436; /* -234...435 */ + a0 += (a0 >> 15) & 436; /* 0...435 */ + a1 = (a2 << 6) + ((s[i] - a0) >> 2); + a1 = mullo(a1, 2405); + + /* invalid inputs might need reduction mod 8246 */ + a1 -= 8246; + a1 += (a1 >> 15) & 8246; + + R8[2] = a0; + R8[3] = a1; + s -= 1; + for (i = 0; i >= 0; --i) { + a2 = a0 = R9[i]; + a0 = mulhi(a0, -64) - mulhi(mullo(a0, 27056), 436); /* -234...218 */ + a0 += s[1 * i + 0]; /* -234...473 */ + a0 -= 436; /* -670..>37 */ + a0 += (a0 >> 15) & 436; /* -234...435 */ + a0 += (a0 >> 15) & 436; /* 0...435 */ + a1 = (a2 << 6) + ((s[i] - a0) >> 2); + a1 = mullo(a1, 2405); + + /* invalid inputs might need reduction mod 436 */ + a1 -= 436; + a1 += (a1 >> 15) & 436; + + R8[2 * i] = a0; + R8[2 * i + 1] = a1; + } + + /* R8 ------> R7: reconstruct mod 6*[334]+[8246] */ + + R7[6] = R8[3]; + s -= 3; + for (i = 2; i >= 0; --i) { + a2 = a0 = R8[i]; + a0 = mulhi(a0, 62) - mulhi(mullo(a0, 15305), 334); /* -167...182 */ + a0 += s[1 * i + 0]; /* -167...437 */ + a0 -= 334; /* -501..>103 */ + a0 += (a0 >> 15) & 334; /* -167...333 */ + a0 += (a0 >> 15) & 334; /* 0...333 */ + a1 = (a2 << 7) + ((s[i] - a0) >> 1); + a1 = mullo(a1, -22761); + + /* invalid inputs might need reduction mod 334 */ + a1 -= 334; + a1 += (a1 >> 15) & 334; + + R7[2 * i] = a0; + R7[2 * i + 1] = a1; + } + + /* R7 ------> R6: reconstruct mod 13*[292]+[7229] */ + + i = 0; + s -= 1; + a2 = a0 = R7[6]; + a0 = mulhi(a0, 64) - mulhi(mullo(a0, 8080), 292); /* -146...162 */ + a0 += s[1 * i + 0]; /* -146...417 */ + a0 -= 292; /* -438..>125 */ + a0 += (a0 >> 15) & 292; /* -146...291 */ + a0 += (a0 >> 15) & 292; /* 0...291 */ + a1 = (a2 << 6) + ((s[i] - a0) >> 2); + a1 = mullo(a1, -3591); + + /* invalid inputs might need reduction mod 7229 */ + a1 -= 7229; + a1 += (a1 >> 15) & 7229; + + R6[12] = a0; + R6[13] = a1; + s -= 6; + for (i = 5; i >= 0; --i) { + a2 = a0 = R7[i]; + a0 = mulhi(a0, 64) - mulhi(mullo(a0, 8080), 292); /* -146...162 */ + a0 += s[1 * i + 0]; /* -146...417 */ + a0 -= 292; /* -438..>125 */ + a0 += (a0 >> 15) & 292; /* -146...291 */ + a0 += (a0 >> 15) & 292; /* 0...291 */ + a1 = (a2 << 6) + ((s[i] - a0) >> 2); + a1 = mullo(a1, -3591); + + /* invalid inputs might need reduction mod 292 */ + a1 -= 292; + a1 += (a1 >> 15) & 292; + + R6[2 * i] = a0; + R6[2 * i + 1] = a1; + } + + /* R6 ------> R5: reconstruct mod 26*[273]+[7229] */ + + R5[26] = R6[13]; + s -= 13; + for (i = 12; i >= 0; --i) { + a2 = a0 = R6[i]; + a0 = mulhi(a0, 1) - mulhi(mullo(a0, 4081), 273); /* -137...136 */ + a0 += s[1 * i + 0]; /* -137...391 */ + a0 -= 273; /* -410..>118 */ + a0 += (a0 >> 15) & 273; /* -137...272 */ + a0 += (a0 >> 15) & 273; /* 0...272 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, 4081); + + /* invalid inputs might need reduction mod 273 */ + a1 -= 273; + a1 += (a1 >> 15) & 273; + + R5[2 * i] = a0; + R5[2 * i + 1] = a1; + } + + /* R5 ------> R4: reconstruct mod 53*[4225]+[438] */ + + i = 0; + s -= 1; + a2 = a0 = R5[26]; + a0 = mulhi(a0, -259) - mulhi(mullo(a0, -3971), 4225); /* -2178...2112 */ + a0 += s[1 * i + 0]; /* -2178...2367 */ + a0 += (a0 >> 15) & 4225; /* 0...4224 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, 12161); + + /* invalid inputs might need reduction mod 438 */ + a1 -= 438; + a1 += (a1 >> 15) & 438; + + R4[52] = a0; + R4[53] = a1; + s -= 52; + i = 10; + for (;;) { + A0 = 
_mm256_loadu_si256((__m256i *) &R5[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, -259), mulhiconst(mulloconst(A0, -3971), 4225)); /* -2178...2112 */ + A0 = add(A0, S1); /* -2178...2367 */ + A0 = sub(mulhiconst(A0, -259), mulhiconst(mulloconst(A0, -3971), 4225)); /* -2122...2121 */ + A0 = add(A0, S0); /* -2122...2376 */ + A0 = ifnegaddconst(A0, 4225); /* 0...4224 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, 12161); + + /* invalid inputs might need reduction mod 4225 */ + A1 = ifgesubconst(A1, 4225); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R4 ------> R3: reconstruct mod 107*[65]+[1723] */ + + i = 0; + s -= 1; + a2 = a0 = R4[53]; + a0 = mulhi(a0, 1) - mulhi(mullo(a0, 4033), 65); /* -33...32 */ + a0 += s[1 * i + 0]; /* -33...287 */ + a0 = mulhi(a0, 16) - mulhi(mullo(a0, -1008), 65); /* -33...32 */ + a0 += (a0 >> 15) & 65; /* 0...64 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, 4033); + + /* invalid inputs might need reduction mod 1723 */ + a1 -= 1723; + a1 += (a1 >> 15) & 1723; + + R3[106] = a0; + R3[107] = a1; + s -= 0; + i = 37; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R4[i]); + A0 = sub(mulhiconst(A0, 16), mulhiconst(mulloconst(A0, -1008), 65)); /* -33...36 */ + A0 = ifnegaddconst(A0, 65); /* 0...64 */ + A1 = signedshiftrightconst(sub(A2, A0), 0); + A1 = mulloconst(A1, 4033); + + /* invalid inputs might need reduction mod 65 */ + A1 = ifgesubconst(A1, 65); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R3 ------> R2: reconstruct mod 214*[2053]+[1723] */ + + R2[214] = R3[107]; + s -= 214; + i = 91; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R3[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 100), mulhiconst(mulloconst(A0, -8172), 2053)); /* -1027...1051 */ + A0 = add(A0, S1); /* -1027...1306 */ + A0 = sub(mulhiconst(A0, 100), mulhiconst(mulloconst(A0, -8172), 2053)); /* -1029...1028 */ + A0 = add(A0, S0); /* -1029...1283 */ + A0 = ifnegaddconst(A0, 2053); /* 0...2052 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, -31539); + + /* invalid inputs 
might need reduction mod 2053 */ + A1 = ifgesubconst(A1, 2053); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R2 ------> R1: reconstruct mod 428*[11597]+[1723] */ + + R1[428] = R2[214]; + s -= 428; + i = 198; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R2[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, -3643), mulhiconst(mulloconst(A0, -1447), 11597)); /* -6710...5798 */ + A0 = add(A0, S1); /* -6710...6053 */ + A0 = sub(mulhiconst(A0, -3643), mulhiconst(mulloconst(A0, -1447), 11597)); /* -6135...6171 */ + A0 = add(A0, S0); /* -6135...6426 */ + A0 = ifnegaddconst(A0, 11597); /* 0...11596 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, -11387); + + /* invalid inputs might need reduction mod 11597 */ + A1 = ifgesubconst(A1, 11597); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R1 ------> R0: reconstruct mod 857*[1723] */ + + R0[856] = 3 * R1[428] - 2583; + s -= 428; + i = 412; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R1[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 365), mulhiconst(mulloconst(A0, -9737), 1723)); /* -862...952 */ + A0 = add(A0, S0); /* -862...1207 */ + A0 = ifnegaddconst(A0, 1723); /* 0...1722 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, 20083); + + /* invalid inputs might need reduction mod 1723 */ + A1 = ifgesubconst(A1, 1723); + + A0 = mulloconst(A0, 3); + A1 = mulloconst(A1, 3); + A0 = subconst(A0, 2583); + A1 = subconst(A1, 2583); + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) 
& -i); + } +} diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_857x1723.h b/crypto_kem/sntrup857/avx2/crypto_decode_857x1723.h new file mode 100644 index 00000000..6e1391f3 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_857x1723.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_857X1723_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_857X1723_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x1723_STRBYTES 1152 +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x1723_ITEMS 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x1723_ITEMBYTES 2 + +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x1723(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_857x3.c b/crypto_kem/sntrup857/avx2/crypto_decode_857x3.c new file mode 100644 index 00000000..5a50a4d8 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_857x3.c @@ -0,0 +1,65 @@ +#include "crypto_decode_857x3.h" +#include <immintrin.h> +#define uint8 uint8_t + +#define p 857 +#define loops 7 +#define overshoot 10 + +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x3(void *v, const unsigned char *s) { + uint8 *f = v; + int loop; + uint8 *nextf = f + 128 - 4 * overshoot; + const unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i s0 = _mm256_loadu_si256((const __m256i *) s); + s = nexts; + nexts += 32; + + __m256i s1 = _mm256_srli_epi16(s0 & _mm256_set1_epi8(-16), 4); + s0 &= _mm256_set1_epi8(15); + + __m256i a0 = _mm256_unpacklo_epi8(s0, s1); + /* 0 0>>4 1 1>>4 2 2>>4 3 3>>4 4 4>>4 5 5>>4 6 6>>4 7 7>>4 */ + /* 16 16>>4 ... */ + __m256i a1 = _mm256_unpackhi_epi8(s0, s1); + /* 8 8>>4 9 9>>4 10 10>>4 ... */ + /* 24 24>>4 ... */ + + __m256i a2 = _mm256_srli_epi16(a0 & _mm256_set1_epi8(12), 2); + __m256i a3 = _mm256_srli_epi16(a1 & _mm256_set1_epi8(12), 2); + a0 &= _mm256_set1_epi8(3); + a1 &= _mm256_set1_epi8(3); + + __m256i b0 = _mm256_unpacklo_epi8(a0, a2); + /* 0 0>>2 0>>4 0>>6 1 1>>2 1>>4 1>>6 */ + /* 2 2>>2 2>>4 2>>6 3 3>>2 3>>4 3>>6 */ + /* 16 16>>2 16>>4 16>>6 ... */ + __m256i b2 = _mm256_unpackhi_epi8(a0, a2); + /* 4 4>>2 ... */ + __m256i b1 = _mm256_unpacklo_epi8(a1, a3); + /* 8 8>>2 ... */ + __m256i b3 = _mm256_unpackhi_epi8(a1, a3); + /* 12 12>>2 ...
*/ + + __m256i f0 = _mm256_permute2x128_si256(b0, b2, 0x20); + __m256i f2 = _mm256_permute2x128_si256(b0, b2, 0x31); + __m256i f1 = _mm256_permute2x128_si256(b1, b3, 0x20); + __m256i f3 = _mm256_permute2x128_si256(b1, b3, 0x31); + + f0 = _mm256_add_epi8(f0, _mm256_set1_epi8(-1)); + f1 = _mm256_add_epi8(f1, _mm256_set1_epi8(-1)); + f2 = _mm256_add_epi8(f2, _mm256_set1_epi8(-1)); + f3 = _mm256_add_epi8(f3, _mm256_set1_epi8(-1)); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + f = nextf; + nextf += 128; + } + + *f = ((uint8)(*s & 3)) - 1; +} diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_857x3.h b/crypto_kem/sntrup857/avx2/crypto_decode_857x3.h new file mode 100644 index 00000000..8bac5dd0 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_857x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_857X3_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_857X3_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x3_STRBYTES 215 +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x3_ITEMS 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x3(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_857x5167.c b/crypto_kem/sntrup857/avx2/crypto_decode_857x5167.c new file mode 100644 index 00000000..be710422 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_857x5167.c @@ -0,0 +1,424 @@ +#include "crypto_decode_857x5167.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define int32 int32_t + +static inline int16 mullo(int16 x, int16 y) { + return x * y; +} + +static inline int16 mulhi(int16 x, int16 y) { + return (x * (int32)y) >> 16; +} + +static inline __m256i add(__m256i x, __m256i y) { + return _mm256_add_epi16(x, y); +} + +static inline __m256i sub(__m256i x, __m256i y) { + return _mm256_sub_epi16(x, y); +} + +static inline __m256i shiftleftconst(__m256i x, int16 y) { + return _mm256_slli_epi16(x, y); +} + +static inline __m256i signedshiftrightconst(__m256i x, int16 y) { + return _mm256_srai_epi16(x, y); +} + +static inline __m256i addconst(__m256i x, int16 y) { + return add(x, _mm256_set1_epi16(y)); +} + +static inline __m256i subconst(__m256i x, int16 y) { + return sub(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulloconst(__m256i x, int16 y) { + return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulhiconst(__m256i x, int16 y) { + return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i ifgesubconst(__m256i x, int16 y) { + __m256i y16 = _mm256_set1_epi16(y); + __m256i top16 = _mm256_set1_epi16((int16)(y - 1)); + return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); +} + +static inline __m256i ifnegaddconst(__m256i x, int16 y) { + return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); +} + +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x5167(void *v, const unsigned char *s) { + int16 *R0 = v; + int16 R1[429], R2[215], R3[108], R4[54], R5[27], R6[14], R7[7], R8[4], R9[2], R10[1]; + long long i; + int16 a0, a1, a2; + __m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; + + s += PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x5167_STRBYTES; + a1 = 0; + a1 += *--s; /* 0...255 */ + a1 = mulhi(a1, 841) - mulhi(mullo(a1, -2695), 6225); + a1 += *--s; /* -3113...3370 */ + a1 += (a1 >> 15) & 6225; /* 0...6224 */ + R10[0] = a1; + + /* 
R10 ------> R9: reconstruct mod 1*[5476]+[291] */ + + i = 0; + s -= 1; + a2 = a0 = R10[0]; + a0 = mulhi(a0, -1248) - mulhi(mullo(a0, -3064), 5476); /* -3050...2738 */ + a0 += s[1 * i + 0]; /* -3050...2993 */ + a0 += (a0 >> 15) & 5476; /* 0...5475 */ + a1 = (a2 << 6) + ((s[i] - a0) >> 2); + a1 = mullo(a1, -3351); + + /* invalid inputs might need reduction mod 291 */ + a1 -= 291; + a1 += (a1 >> 15) & 291; + + R9[0] = a0; + R9[1] = a1; + s -= 0; + + /* R9 ------> R8: reconstruct mod 3*[74]+[1004] */ + + i = 0; + s -= 1; + a2 = a0 = R9[1]; + a0 = mulhi(a0, 10) - mulhi(mullo(a0, -30111), 74); /* -37...39 */ + a0 += s[1 * i + 0]; /* -37...294 */ + a0 = mulhi(a0, -28) - mulhi(mullo(a0, -886), 74); /* -38...37 */ + a0 += (a0 >> 15) & 74; /* 0...73 */ + a1 = (a2 << 7) + ((s[i] - a0) >> 1); + a1 = mullo(a1, 7085); + + /* invalid inputs might need reduction mod 1004 */ + a1 -= 1004; + a1 += (a1 >> 15) & 1004; + + R8[2] = a0; + R8[3] = a1; + s -= 0; + for (i = 0; i >= 0; --i) { + a2 = a0 = R9[i]; + a0 = mulhi(a0, -28) - mulhi(mullo(a0, -886), 74); /* -44...37 */ + a0 += (a0 >> 15) & 74; /* 0...73 */ + a1 = (a2 - a0) >> 1; + a1 = mullo(a1, 7085); + + /* invalid inputs might need reduction mod 74 */ + a1 -= 74; + a1 += (a1 >> 15) & 74; + + R8[2 * i] = a0; + R8[2 * i + 1] = a1; + } + + /* R8 ------> R7: reconstruct mod 6*[2194]+[1004] */ + + R7[6] = R8[3]; + s -= 6; + for (i = 2; i >= 0; --i) { + a2 = a0 = R8[i]; + a0 = mulhi(a0, -302) - mulhi(mullo(a0, -7647), 2194); /* -1173...1097 */ + a0 += s[2 * i + 1]; /* -1173...1352 */ + a0 = mulhi(a0, -302) - mulhi(mullo(a0, -7647), 2194); /* -1104...1102 */ + a0 += s[2 * i + 0]; /* -1104...1357 */ + a0 += (a0 >> 15) & 2194; /* 0...2193 */ + a1 = (a2 << 15) + (s[2 * i + 1] << 7) + ((s[2 * i] - a0) >> 1); + a1 = mullo(a1, 11769); + + /* invalid inputs might need reduction mod 2194 */ + a1 -= 2194; + a1 += (a1 >> 15) & 2194; + + R7[2 * i] = a0; + R7[2 * i + 1] = a1; + } + + /* R7 ------> R6: reconstruct mod 13*[11991]+[5483] */ + + i = 0; + s -= 2; + a0 = R7[6]; + a0 = mulhi(a0, 1807) - mulhi(mullo(a0, -1399), 11991); /* -5996...6447 */ + a0 += s[2 * i + 1]; /* -5996...6702 */ + a0 = mulhi(a0, 1807) - mulhi(mullo(a0, -1399), 11991); /* -6161...6180 */ + a0 += s[2 * i + 0]; /* -6161...6435 */ + a0 += (a0 >> 15) & 11991; /* 0...11990 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, -23321); + + /* invalid inputs might need reduction mod 5483 */ + a1 -= 5483; + a1 += (a1 >> 15) & 5483; + + R6[12] = a0; + R6[13] = a1; + s -= 12; + for (i = 5; i >= 0; --i) { + a0 = R7[i]; + a0 = mulhi(a0, 1807) - mulhi(mullo(a0, -1399), 11991); /* -5996...6447 */ + a0 += s[2 * i + 1]; /* -5996...6702 */ + a0 = mulhi(a0, 1807) - mulhi(mullo(a0, -1399), 11991); /* -6161...6180 */ + a0 += s[2 * i + 0]; /* -6161...6435 */ + a0 += (a0 >> 15) & 11991; /* 0...11990 */ + a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; + a1 = mullo(a1, -23321); + + /* invalid inputs might need reduction mod 11991 */ + a1 -= 11991; + a1 += (a1 >> 15) & 11991; + + R6[2 * i] = a0; + R6[2 * i + 1] = a1; + } + + /* R6 ------> R5: reconstruct mod 26*[1752]+[5483] */ + + R5[26] = R6[13]; + s -= 13; + for (i = 12; i >= 0; --i) { + a2 = a0 = R6[i]; + a0 = mulhi(a0, 64) - mulhi(mullo(a0, -9576), 1752); /* -876...892 */ + a0 += s[1 * i + 0]; /* -876...1147 */ + a0 += (a0 >> 15) & 1752; /* 0...1751 */ + a1 = (a2 << 5) + ((s[i] - a0) >> 3); + a1 = mullo(a1, -1197); + + /* invalid inputs might need reduction mod 1752 */ + a1 -= 1752; + a1 += (a1 >> 15) & 1752; + + R5[2 * i] = a0; + R5[2 * i + 1] = a1; + } + + /* R5 
------> R4: reconstruct mod 53*[10713]+[131] */ + + i = 0; + s -= 1; + a2 = a0 = R5[26]; + a0 = mulhi(a0, 658) - mulhi(mullo(a0, -1566), 10713); /* -5357...5521 */ + a0 += s[1 * i + 0]; /* -5357...5776 */ + a0 += (a0 >> 15) & 10713; /* 0...10712 */ + a1 = (a2 << 8) + s[i] - a0; + a1 = mullo(a1, -14743); + + /* invalid inputs might need reduction mod 131 */ + a1 -= 131; + a1 += (a1 >> 15) & 131; + + R4[52] = a0; + R4[53] = a1; + s -= 52; + i = 10; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R5[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 658), mulhiconst(mulloconst(A0, -1566), 10713)); /* -5357...5521 */ + A0 = add(A0, S1); /* -5357...5776 */ + A0 = sub(mulhiconst(A0, 658), mulhiconst(mulloconst(A0, -1566), 10713)); /* -5411...5414 */ + A0 = add(A0, S0); /* -5411...5669 */ + A0 = ifnegaddconst(A0, 10713); /* 0...10712 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, -14743); + + /* invalid inputs might need reduction mod 10713 */ + A1 = ifgesubconst(A1, 10713); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R4 ------> R3: reconstruct mod 107*[1656]+[5167] */ + + i = 0; + s -= 2; + a2 = a0 = R4[53]; + a0 = mulhi(a0, 280) - mulhi(mullo(a0, -10131), 1656); /* -828...898 */ + a0 += s[2 * i + 1]; /* -828...1153 */ + a0 = mulhi(a0, 280) - mulhi(mullo(a0, -10131), 1656); /* -832...832 */ + a0 += s[2 * i + 0]; /* -832...1087 */ + a0 += (a0 >> 15) & 1656; /* 0...1655 */ + a1 = (a2 << 13) + (s[2 * i + 1] << 5) + ((s[2 * i] - a0) >> 3); + a1 = mullo(a1, 1583); + + /* invalid inputs might need reduction mod 5167 */ + a1 -= 5167; + a1 += (a1 >> 15) & 5167; + + R3[106] = a0; + R3[107] = a1; + s -= 53; + i = 37; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R4[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 280), mulhiconst(mulloconst(A0, -10131), 1656)); /* -828...898 */ + A0 = add(A0, S0); /* -828...1153 */ + A0 = ifnegaddconst(A0, 1656); /* 0...1655 */ + A1 = add(shiftleftconst(A2, 5), signedshiftrightconst(sub(S0, A0), 3)); + A1 = mulloconst(A1, 1583); + + /* invalid inputs might need reduction mod 1656 */ + A1 = ifgesubconst(A1, 1656); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); + if (!i) { + break; + } + i 
= -16 - ((~15) & -i); + } + + /* R3 ------> R2: reconstruct mod 214*[651]+[5167] */ + + R2[214] = R3[107]; + s -= 107; + i = 91; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R3[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 295), mulhiconst(mulloconst(A0, -25771), 651)); /* -326...399 */ + A0 = add(A0, S0); /* -326...654 */ + A0 = subconst(A0, 651); /* -977...3 */ + A0 = ifnegaddconst(A0, 651); /* -326...650 */ + A0 = ifnegaddconst(A0, 651); /* 0...650 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, -10973); + + /* invalid inputs might need reduction mod 651 */ + A1 = ifgesubconst(A1, 651); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R2 ------> R1: reconstruct mod 428*[408]+[5167] */ + + R1[428] = R2[214]; + s -= 214; + i = 198; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R2[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -152), mulhiconst(mulloconst(A0, 24415), 408)); /* -242...204 */ + A0 = add(A0, S0); /* -242...459 */ + A0 = subconst(A0, 408); /* -650...51 */ + A0 = ifnegaddconst(A0, 408); /* -242...407 */ + A0 = ifnegaddconst(A0, 408); /* 0...407 */ + A1 = add(shiftleftconst(A2, 5), signedshiftrightconst(sub(S0, A0), 3)); + A1 = mulloconst(A1, -1285); + + /* invalid inputs might need reduction mod 408 */ + A1 = ifgesubconst(A1, 408); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R1 ------> R0: reconstruct mod 857*[5167] */ + + R0[856] = R1[428] - 2583; + s -= 856; + i = 412; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R1[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, -33), mulhiconst(mulloconst(A0, -3247), 5167)); /* -2592...2583 */ + A0 = add(A0, S1); /* -2592...2838 */ + A0 = sub(mulhiconst(A0, -33), mulhiconst(mulloconst(A0, -3247), 5167)); /* -2585...2584 */ + A0 = add(A0, S0); /* -2585...2839 */ + A0 = ifnegaddconst(A0, 5167); /* 0...5166 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, -19761); + + /* invalid inputs might need reduction mod 5167 */ + A1 = ifgesubconst(A1, 5167); + + A0 = subconst(A0, 2583); + A1 = subconst(A1, 2583); + 
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } +} diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_857x5167.h b/crypto_kem/sntrup857/avx2/crypto_decode_857x5167.h new file mode 100644 index 00000000..fbd13dbf --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_857x5167.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_857X5167_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_857X5167_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x5167_STRBYTES 1322 +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x5167_ITEMS 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x5167_ITEMBYTES 2 + +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x5167(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_857xint16.c b/crypto_kem/sntrup857/avx2/crypto_decode_857xint16.c new file mode 100644 index 00000000..d52df0c3 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_857xint16.c @@ -0,0 +1,16 @@ +#include "crypto_decode_857xint16.h" + + +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_857xint16.h b/crypto_kem/sntrup857/avx2/crypto_decode_857xint16.h new file mode 100644 index 00000000..2c626a2c --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_857xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_857XINT16_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_857XINT16_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint16_STRBYTES 1714 +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint16_ITEMS 857 + +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_857xint32.c b/crypto_kem/sntrup857/avx2/crypto_decode_857xint32.c new file mode 100644 index 00000000..cfd87ef9 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_857xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_857xint32.h" + + +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_857xint32.h b/crypto_kem/sntrup857/avx2/crypto_decode_857xint32.h new file mode 100644 index 00000000..23ca858e --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_857xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_857XINT32_H +#define 
PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_857XINT32_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint32_STRBYTES 3428 +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint32_ITEMBYTES 4 +#define PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint32_ITEMS 857 + +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint32(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_int16.c b/crypto_kem/sntrup857/avx2/crypto_decode_int16.c new file mode 100644 index 00000000..a3a53a47 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_int16.c @@ -0,0 +1,9 @@ +#include "crypto_decode_int16.h" + + +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_int16(void *x, const unsigned char *s) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *(uint16_t *) x = u0 | u1; +} diff --git a/crypto_kem/sntrup857/avx2/crypto_decode_int16.h b/crypto_kem/sntrup857/avx2/crypto_decode_int16.h new file mode 100644 index 00000000..fcaa1f3f --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_decode_int16.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_INT16_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_DECODE_INT16_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_multsntrup857_STRBYTES 2 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_multsntrup857_ITEMBYTES 2 +#define PQCLEAN_SNTRUP857_AVX2_crypto_core_multsntrup857_ITEMS 1 +void PQCLEAN_SNTRUP857_AVX2_crypto_decode_int16(void *x, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857x1723.c b/crypto_kem/sntrup857/avx2/crypto_encode_857x1723.c new file mode 100644 index 00000000..8ae91a55 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857x1723.c @@ -0,0 +1,283 @@ +#include "crypto_encode_857x1723.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x1723(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[429]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 54; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2583)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1723)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[428] = (((R0[856] + 2583) & 16383) * 10923) >> 15; + + reading = (uint16 *) R; + writing = R; + i = 14; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 20; + writing -= 10; + out -= 20; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y 
= x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(11597)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(11597)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[214] = R[428]; + + reading = (uint16 *) R; + writing = R; + i = 7; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 10; + writing -= 5; + out -= 10; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(2053)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(2053)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[107] = R[214]; + + for (i = 0; i < 53; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)65; + R[i] = r2; + } + r0 = R[106]; + r1 = R[107]; + r2 = r0 + r1 * (uint32)65; + *out++ = r2; + r2 >>= 8; + R[53] = r2; + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 12; + writing -= 6; + out -= 12; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(4225)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(4225)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 
0x20)); + reading += 32; + writing += 16; + out += 32; + } + r0 = R[52]; + r1 = R[53]; + r2 = r0 + r1 * (uint32)4225; + *out++ = r2; + r2 >>= 8; + R[26] = r2; + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 6; + writing -= 3; + out -= 3; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(273)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[13] = R[26]; + + for (i = 0; i < 7; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)292; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)334; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[3] = R[6]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)436; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)743; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857x1723.h b/crypto_kem/sntrup857/avx2/crypto_encode_857x1723.h new file mode 100644 index 00000000..66a9e6ae --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857x1723.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857X1723_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857X1723_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x1723_STRBYTES 1152 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x1723_ITEMS 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x1723_ITEMBYTES 2 + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x1723(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857x1723round.c b/crypto_kem/sntrup857/avx2/crypto_encode_857x1723round.c new file mode 100644 index 00000000..7d9937a1 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857x1723round.c @@ -0,0 +1,285 @@ +#include "crypto_encode_857x1723round.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x1723round(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[429]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 54; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); + x = _mm256_add_epi16(x, _mm256_add_epi16(x, x)); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2583)); + x &= _mm256_set1_epi16(16383); + x = 
_mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1723)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[428] = (((3 * ((10923 * R0[856] + 16384) >> 15) + 2583) & 16383) * 10923) >> 15; + + reading = (uint16 *) R; + writing = R; + i = 14; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 20; + writing -= 10; + out -= 20; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(11597)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(11597)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[214] = R[428]; + + reading = (uint16 *) R; + writing = R; + i = 7; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 10; + writing -= 5; + out -= 10; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(2053)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(2053)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[107] = R[214]; + + for (i = 0; i < 53; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)65; + R[i] = r2; + } + r0 = R[106]; + r1 = R[107]; + r2 = r0 + r1 * (uint32)65; + *out++ = r2; + r2 >>= 8; + R[53] = r2; + + reading = 
(uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 12; + writing -= 6; + out -= 12; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(4225)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(4225)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + r0 = R[52]; + r1 = R[53]; + r2 = r0 + r1 * (uint32)4225; + *out++ = r2; + r2 >>= 8; + R[26] = r2; + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 6; + writing -= 3; + out -= 3; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(273)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[13] = R[26]; + + for (i = 0; i < 7; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)292; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)334; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[3] = R[6]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)436; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)743; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857x1723round.h b/crypto_kem/sntrup857/avx2/crypto_encode_857x1723round.h new file mode 100644 index 00000000..be2cb158 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857x1723round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857X1723ROUND_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857X1723ROUND_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x1723round_STRBYTES 1152 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x1723round_ITEMS 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x1723round_ITEMBYTES 2 + +void 
PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x1723round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857x3.c b/crypto_kem/sntrup857/avx2/crypto_encode_857x3.c new file mode 100644 index 00000000..44e734f5 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857x3.c @@ -0,0 +1,64 @@ +#include "crypto_encode_857x3.h" +#include +#define uint8 uint8_t + +#define p 857 +#define loops 7 +#define overshoot 10 + +static const union { + uint8 init[32]; + __m256i val; +} lobytes_buf = { .init = { + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + } +}; +#define lobytes (lobytes_buf.val) + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x3(unsigned char *s, const void *v) { + const uint8 *f = v; + int loop; + const uint8 *nextf = f + 128 - 4 * overshoot; + unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i f0 = _mm256_loadu_si256((const __m256i *) (f + 0)); + __m256i f1 = _mm256_loadu_si256((const __m256i *) (f + 32)); + __m256i f2 = _mm256_loadu_si256((const __m256i *) (f + 64)); + __m256i f3 = _mm256_loadu_si256((const __m256i *) (f + 96)); + f = nextf; + nextf += 128; + + __m256i a0 = _mm256_packus_epi16(f0 & lobytes, f1 & lobytes); + /* 0 2 4 6 8 10 12 14 32 34 36 38 40 42 44 46 */ + /* 16 18 20 22 24 26 28 30 48 50 52 54 56 58 60 62 */ + __m256i a1 = _mm256_packus_epi16(_mm256_srli_epi16(f0, 8), _mm256_srli_epi16(f1, 8)); + /* 1 3 ... */ + __m256i a2 = _mm256_packus_epi16(f2 & lobytes, f3 & lobytes); + __m256i a3 = _mm256_packus_epi16(_mm256_srli_epi16(f2, 8), _mm256_srli_epi16(f3, 8)); + + a0 = _mm256_add_epi8(a0, _mm256_slli_epi16(a1 & _mm256_set1_epi8(63), 2)); + a2 = _mm256_add_epi8(a2, _mm256_slli_epi16(a3 & _mm256_set1_epi8(63), 2)); + + __m256i b0 = _mm256_packus_epi16(a0 & lobytes, a2 & lobytes); + /* 0 4 8 12 32 36 40 44 64 68 72 76 96 100 104 108 */ + /* 16 20 24 28 48 52 56 60 80 84 88 92 112 116 120 124 */ + __m256i b2 = _mm256_packus_epi16(_mm256_srli_epi16(a0, 8), _mm256_srli_epi16(a2, 8)); + /* 2 6 ... 
*/ + + b0 = _mm256_add_epi8(b0, _mm256_slli_epi16(b2 & _mm256_set1_epi8(15), 4)); + + b0 = _mm256_permutevar8x32_epi32(b0, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + + b0 = _mm256_add_epi8(b0, _mm256_set1_epi8(85)); + + _mm256_storeu_si256((__m256i *) s, b0); + s = nexts; + nexts += 32; + } + + *s++ = *f++ + 1; +} diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857x3.h b/crypto_kem/sntrup857/avx2/crypto_encode_857x3.h new file mode 100644 index 00000000..13c61537 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857X3_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857X3_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x3_STRBYTES 215 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x3_ITEMS 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857x5167.c b/crypto_kem/sntrup857/avx2/crypto_encode_857x5167.c new file mode 100644 index 00000000..1dbc4f29 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857x5167.c @@ -0,0 +1,331 @@ +#include "crypto_encode_857x5167.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x5167(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[429]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 27; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 8; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2583)); + x2 = _mm256_add_epi16(x2, _mm256_set1_epi16(2583)); + x &= _mm256_set1_epi16(16383); + x2 &= _mm256_set1_epi16(16383); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(5167)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(5167)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[428] = ((R0[856] + 2583) & 16383); + + reading = (uint16 *) R; + writing = R; + i = 27; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 2; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(408)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 
14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[214] = R[428]; + + reading = (uint16 *) R; + writing = R; + i = 14; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 10; + writing -= 5; + out -= 5; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(651)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[107] = R[214]; + + reading = (uint16 *) R; + writing = R; + i = 7; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 6; + writing -= 3; + out -= 3; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1656)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + r0 = R[106]; + r1 = R[107]; + r2 = r0 + r1 * (uint32)1656; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[53] = r2; + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 12; + writing -= 6; + out -= 12; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(10713)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(10713)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + 
_mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + r0 = R[52]; + r1 = R[53]; + r2 = r0 + r1 * (uint32)10713; + *out++ = r2; + r2 >>= 8; + R[26] = r2; + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 6; + writing -= 3; + out -= 3; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1752)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = _mm256_extract_epi32(x, 4); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 = _mm256_extract_epi32(x, 6); + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + s0 >>= 8; + *out++ = s0; + reading += 16; + writing += 8; + } + R[13] = R[26]; + + for (i = 0; i < 7; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)11991; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2194; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[3] = R[6]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)74; + R[0] = r2; + r0 = R[2]; + r1 = R[3]; + r2 = r0 + r1 * (uint32)74; + *out++ = r2; + r2 >>= 8; + R[1] = r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)5476; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857x5167.h b/crypto_kem/sntrup857/avx2/crypto_encode_857x5167.h new file mode 100644 index 00000000..0c513509 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857x5167.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857X5167_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857X5167_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x5167_STRBYTES 1322 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x5167_ITEMS 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x5167_ITEMBYTES 2 + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x5167(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857xfreeze3.c b/crypto_kem/sntrup857/avx2/crypto_encode_857xfreeze3.c new file mode 100644 index 00000000..108da266 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857xfreeze3.c @@ -0,0 +1,31 @@ +#include "crypto_encode_857xfreeze3.h" +#include +#define int16 int16_t + +#define p 857 + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xfreeze3(unsigned char *s, const void *v) { + const int16 *r = v; + + int i = p - 16; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) r); + __m256i y = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); + x = _mm256_sub_epi16(x, y); + y = _mm256_add_epi16(y, y); + x = _mm256_sub_epi16(x, y); + __m128i x0 = _mm256_extractf128_si256(x, 0); + __m128i x1 = _mm256_extractf128_si256(x, 1); + _mm_storeu_si128((__m128i *) s, _mm_packs_epi16(x0, x1)); + i -= 16; + r += 16; + s += 16; + } while (i >= 0); + if (i <= -16) { + break; + } + r += i; + s += i; + } +} 
diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857xfreeze3.h b/crypto_kem/sntrup857/avx2/crypto_encode_857xfreeze3.h new file mode 100644 index 00000000..99052e21 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857xfreeze3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857XFREEZE3_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857XFREEZE3_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xfreeze3_STRBYTES 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xfreeze3_ITEMS 857 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xfreeze3_ITEMBYTES 2 + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xfreeze3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857xint16.c b/crypto_kem/sntrup857/avx2/crypto_encode_857xint16.c new file mode 100644 index 00000000..7c889f7a --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_857xint16.h" + + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_857xint16.h b/crypto_kem/sntrup857/avx2/crypto_encode_857xint16.h new file mode 100644 index 00000000..e131a942 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_857xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857XINT16_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_857XINT16_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xint16_STRBYTES 1714 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xint16_ITEMS 857 + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_int16.c b/crypto_kem/sntrup857/avx2/crypto_encode_int16.c new file mode 100644 index 00000000..33767274 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_int16.c @@ -0,0 +1,9 @@ +#include "crypto_encode_int16.h" + +#define uint16 uint16_t + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_int16(unsigned char *s, const void *x) { + uint16 u = *(const uint16 *) x; + s[0] = u; + s[1] = u >> 8; +} diff --git a/crypto_kem/sntrup857/avx2/crypto_encode_int16.h b/crypto_kem/sntrup857/avx2/crypto_encode_int16.h new file mode 100644 index 00000000..1ca7ca0a --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_encode_int16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_INT16_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_ENCODE_INT16_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_int16_STRBYTES 2 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_int16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP857_AVX2_crypto_encode_int16_ITEMS 1 + +void PQCLEAN_SNTRUP857_AVX2_crypto_encode_int16(unsigned char *s, const void *x); +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_sort_int32.c b/crypto_kem/sntrup857/avx2/crypto_sort_int32.c new file mode 100644 index 00000000..a438d7d3 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_sort_int32.c @@ -0,0 +1,1210 @@ +#include "crypto_sort_int32.h" +#include +// Based on supercop-20200820/crypto_sort/int32/avx2 + + +#define int32 int32_t + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define 
int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min((a),(b)); \ + (b) = int32x8_max((a),(b)); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = (int32)((int64_t) * b - (int64_t) * a); + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if ((long long) n < 8) { + while ((long long) n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + 
int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); 
+ if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], 
x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = (p << 1 == q); + flipflip = !flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 
8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 
0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], 
x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_SNTRUP857_AVX2_crypto_sort_int32(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); 
+ int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *) y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_SNTRUP857_AVX2_crypto_sort_int32(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = 
_mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = 
_mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/crypto_kem/sntrup857/avx2/crypto_sort_int32.h b/crypto_kem/sntrup857/avx2/crypto_sort_int32.h new file mode 100644 index 00000000..81d82aa0 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_SORT +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_SORT + +#include +#include + + +void PQCLEAN_SNTRUP857_AVX2_crypto_sort_int32(int32_t *x, size_t n); + +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_sort_uint32.c b/crypto_kem/sntrup857/avx2/crypto_sort_uint32.c new file mode 100644 index 00000000..eb138f6f --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void 
PQCLEAN_SNTRUP857_AVX2_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_SNTRUP857_AVX2_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/sntrup857/avx2/crypto_sort_uint32.h b/crypto_kem/sntrup857/avx2/crypto_sort_uint32.h new file mode 100644 index 00000000..5ccfe3ab --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_sort_uint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_SORT_UINT32_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_SORT_UINT32_H + +#include +#include + + +void PQCLEAN_SNTRUP857_AVX2_crypto_sort_uint32(uint32_t *array, size_t n); + +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_stream_aes256ctr.c b/crypto_kem/sntrup857/avx2/crypto_stream_aes256ctr.c new file mode 100644 index 00000000..048a6a0c --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_stream_aes256ctr.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" + + +int PQCLEAN_SNTRUP857_AVX2_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); + return 0; +} diff --git a/crypto_kem/sntrup857/avx2/crypto_stream_aes256ctr.h b/crypto_kem/sntrup857/avx2/crypto_stream_aes256ctr.h new file mode 100644 index 00000000..6c43671d --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_stream_aes256ctr.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_STREAM_AES256CTR_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_STREAM_AES256CTR_H +#include "aes.h" +#include +#include + + + +int PQCLEAN_SNTRUP857_AVX2_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]); + +#endif diff --git a/crypto_kem/sntrup857/avx2/crypto_verify_1184.c b/crypto_kem/sntrup857/avx2/crypto_verify_1184.c new file mode 100644 index 00000000..20fb5f4c --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_verify_1184.c @@ -0,0 +1,36 @@ +#include "crypto_verify_1184.h" +#include + +int PQCLEAN_SNTRUP857_AVX2_crypto_verify_1184(const unsigned char *x, const unsigned char *y) { + __m256i diff = _mm256_set1_epi8(0); + unsigned int differentbits = 0; + int i = PQCLEAN_SNTRUP857_AVX2_crypto_verify_1184_BYTES; + + i -= 32; + for (;;) { + do { + __m256i x0 = _mm256_loadu_si256((__m256i *) x); + __m256i y0 = _mm256_loadu_si256((__m256i *) y); + diff |= x0 ^ y0; + i -= 32; + x += 32; + y += 32; + } while (i >= 0); + if (i <= -32) { + break; + } + x += i; + y += i; + } + + diff |= _mm256_srli_epi16(diff, 8); + diff |= _mm256_srli_epi32(diff, 16); + diff |= _mm256_srli_epi64(diff, 32); + + differentbits = _mm256_extract_epi8(diff, 0); + differentbits |= _mm256_extract_epi8(diff, 8); + differentbits |= _mm256_extract_epi8(diff, 16); + differentbits |= _mm256_extract_epi8(diff, 24); + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/crypto_kem/sntrup857/avx2/crypto_verify_1184.h b/crypto_kem/sntrup857/avx2/crypto_verify_1184.h new file mode 100644 index 00000000..606fe2d0 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/crypto_verify_1184.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_SNTRUP857_AVX2_CRYPTO_VERIFY_1184_H +#define PQCLEAN_SNTRUP857_AVX2_CRYPTO_VERIFY_1184_H + +#include +#define PQCLEAN_SNTRUP857_AVX2_crypto_verify_1184_BYTES 1184 + +int PQCLEAN_SNTRUP857_AVX2_crypto_verify_1184(const 
unsigned char *x, const unsigned char *y); +#endif diff --git a/crypto_kem/sntrup857/avx2/kem.c b/crypto_kem/sntrup857/avx2/kem.c new file mode 100644 index 00000000..bbf3abc4 --- /dev/null +++ b/crypto_kem/sntrup857/avx2/kem.c @@ -0,0 +1,247 @@ +#include "api.h" +#include "crypto_sort_uint32.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* ----- small polynomials */ + +/* R3_fromR(R_fromRq(r)) */ +static void R3_fromRq(small *out, const Fq *r) { + crypto_encode_pxfreeze3((unsigned char *) out, (unsigned char *) r); +} + +/* h = f*g in the ring R3 */ +static void R3_mult(small *h, const small *f, const small *g) { + crypto_core_mult3((unsigned char *) h, (const unsigned char *) f, (const unsigned char *) g); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* h = 3f in Rq */ +static void Rq_mult3(Fq *h, const Fq *f) { + crypto_encode_pxint16((unsigned char *) h, f); + crypto_core_scale3((unsigned char *) h, (const unsigned char *) h); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* out = 1/(3*in) in Rq */ +/* caller must have 2p+1 bytes free in out, not just 2p */ +static void Rq_recip3(Fq *out, const small *in) { + crypto_core_inv((unsigned char *) out, (const unsigned char *) in); + /* could check byte 2*p for failure; but, in context, inv always works */ + crypto_decode_pxint16(out, (unsigned char *) out); +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[ppadsort]; + int i; + + randombytes((unsigned char *) L, 4 * p); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < w; ++i) { + L[i] = L[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (L[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_SNTRUP857_AVX2_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +static void Small_random(small *out) { + uint32 L[p]; + int i; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + out[i] = (((L[i] & 0x3fffffff) * 3) >> 30) - 1; + } +} + +/* ----- Streamlined NTRU Prime */ + +typedef small Inputs[p]; /* passed by reference */ +#define Ciphertexts_bytes Rounded_bytes +#define SecretKeys_bytes (2*Small_bytes) +#define PublicKeys_bytes Rq_bytes +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +/* also set r_enc[0]=3 */ +/* also set x[0]=2, and x[1:1+Hash_bytes] = Hash3(r_enc) */ +/* also overwrite x[1+Hash_bytes:1+2*Hash_bytes] */ +static void Hide(unsigned char *x, unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const 
unsigned char *cache) { + Fq h[p]; + int i; + + Small_encode(r_enc + 1, r); + Rq_decode(h, pk); + Rq_mult_small(h, r); + Round_and_encode(c, h); + r_enc[0] = 3; + Hash(x + 1, r_enc, 1 + Small_bytes); + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Hash_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c + Ciphertexts_bytes, x, 1 + Hash_bytes * 2); +} + + +int PQCLEAN_SNTRUP857_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + small g[p]; + for (;;) { + Small_random(g); + { + small v[p + 1]; + crypto_core_inv3((unsigned char *) v, (const unsigned char *) g); + if (v[p] == 0) { + Small_encode(sk + Small_bytes, v); + break; + } + } + } + { + small f[p]; + Short_random(f); + Small_encode(sk, f); + { + Fq h[p + 1]; + Rq_recip3(h, f); /* always works */ + Rq_mult_small(h, g); + Rq_encode(pk, h); + } + } + { + int i; + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Small_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Small_bytes); + } + return 0; +} + +int PQCLEAN_SNTRUP857_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + unsigned char cache[Hash_bytes]; + int i; + { + unsigned char y[1 + PublicKeys_bytes]; /* XXX: can eliminate with incremental hashing */ + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + { + Inputs r; + Short_random(r); + { + unsigned char r_enc[Small_bytes + 1]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(x, c, r_enc, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + } + return 0; +} + +int PQCLEAN_SNTRUP857_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Small_bytes; + int mask, i; + Inputs r; + { + Fq d[p]; + Rounded_decode(d, c); + { + small f[p]; + Small_decode(f, sk); + Rq_mult_small(d, f); + Rq_mult3(d, d); + } + { + small e[p]; + small v[p]; + R3_fromRq(e, d); + Small_decode(v, sk + Small_bytes); + R3_mult(r, e, v); + } + crypto_core_wforce((unsigned char *) r, (unsigned char *) r); + } + { + unsigned char r_enc[1 + Small_bytes]; + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + /* XXX: can use incremental hashing to reduce x size */ + + Hide(x, cnew, r_enc, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Small_bytes; ++i) { + r_enc[i + 1] ^= mask & (r_enc[i + 1] ^ rho[i]); + } + Hash(x + 1, r_enc, 1 + Small_bytes); /* XXX: can instead do cmov on cached hash of rho */ + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/sntrup857/avx2/params.h b/crypto_kem/sntrup857/avx2/params.h new file mode 100644 index 00000000..6de2001f --- /dev/null +++ b/crypto_kem/sntrup857/avx2/params.h @@ -0,0 +1,71 @@ +#ifndef params_H +#define params_H +#include "crypto_core_inv3sntrup857.h" +#include "crypto_core_invsntrup857.h" +#include "crypto_core_mult3sntrup857.h" +#include 
"crypto_core_multsntrup857.h" +#include "crypto_core_scale3sntrup857.h" +#include "crypto_core_weightsntrup857.h" +#include "crypto_core_wforcesntrup857.h" +#include "crypto_decode_857x1723.h" +#include "crypto_decode_857x3.h" +#include "crypto_decode_857x5167.h" +#include "crypto_decode_857xint16.h" +#include "crypto_decode_857xint32.h" +#include "crypto_encode_857x1723.h" +#include "crypto_encode_857x1723round.h" +#include "crypto_encode_857x3.h" +#include "crypto_encode_857x5167.h" +#include "crypto_encode_857xfreeze3.h" +#include "crypto_encode_857xint16.h" +#include "crypto_encode_int16.h" +#include "crypto_verify_1184.h" + + +#define p 857 +#define qinv (-19761) /* reciprocal of q mod 2^16 */ +#define q27 25976 /* closest integer to 2^27/q */ +#define q18 51 /* closest integer to 2^18/q */ +#define ppad 865 +#define crypto_core_weight PQCLEAN_SNTRUP857_AVX2_crypto_core_weightsntrup857 +#define q 5167 +#define w 322 + +#define ppadsort 857 + +#define crypto_verify_clen PQCLEAN_SNTRUP857_AVX2_crypto_verify_1184 + +#define Rq_bytes PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x5167_STRBYTES +#define Rq_encode PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x5167 +#define Rq_decode PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x5167 + +#define Rounded_bytes PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x1723_STRBYTES +#define Rounded_decode PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x1723 + +#define Round_and_encode PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x1723round + +#define Small_bytes PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x3_STRBYTES +#define Small_encode PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x3 +#define Small_decode PQCLEAN_SNTRUP857_AVX2_crypto_decode_857x3 + +#define crypto_encode_pxfreeze3 PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xfreeze3 + +#define crypto_decode_pxint32 PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint32 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP857_AVX2_crypto_decode_857xint16 + +#define crypto_encode_pxint16 PQCLEAN_SNTRUP857_AVX2_crypto_encode_857xint16 + +#define crypto_core_wforce PQCLEAN_SNTRUP857_AVX2_crypto_core_wforcesntrup857 + +#define crypto_core_scale3 PQCLEAN_SNTRUP857_AVX2_crypto_core_scale3sntrup857 + +#define crypto_core_inv PQCLEAN_SNTRUP857_AVX2_crypto_core_invsntrup857 + +#define crypto_core_inv3 PQCLEAN_SNTRUP857_AVX2_crypto_core_inv3sntrup857 + +#define crypto_core_mult PQCLEAN_SNTRUP857_AVX2_crypto_core_multsntrup857 + +#define crypto_core_mult3 PQCLEAN_SNTRUP857_AVX2_crypto_core_mult3sntrup857 + +#endif diff --git a/crypto_kem/sntrup857/clean/LICENSE b/crypto_kem/sntrup857/clean/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/sntrup857/clean/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/sntrup857/clean/Makefile b/crypto_kem/sntrup857/clean/Makefile new file mode 100644 index 00000000..abe07fce --- /dev/null +++ b/crypto_kem/sntrup857/clean/Makefile @@ -0,0 +1,19 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libsntrup857_clean.a +HEADERS=api.h crypto_core_inv3sntrup857.h crypto_core_invsntrup857.h crypto_core_mult3sntrup857.h crypto_core_multsntrup857.h crypto_core_scale3sntrup857.h crypto_core_weightsntrup857.h crypto_core_wforcesntrup857.h crypto_decode_857x1723.h crypto_decode_857x3.h crypto_decode_857x5167.h crypto_decode_857xint16.h crypto_decode_857xint32.h crypto_encode_857x1723.h crypto_encode_857x1723round.h crypto_encode_857x3.h crypto_encode_857x5167.h crypto_encode_857xfreeze3.h crypto_encode_857xint16.h crypto_encode_int16.h crypto_sort_int32.h crypto_sort_uint32.h 
crypto_stream_aes256ctr.h crypto_verify_1184.h params.h +OBJECTS=crypto_core_inv3sntrup857.o crypto_core_invsntrup857.o crypto_core_mult3sntrup857.o crypto_core_multsntrup857.o crypto_core_scale3sntrup857.o crypto_core_weightsntrup857.o crypto_core_wforcesntrup857.o crypto_decode_857x1723.o crypto_decode_857x3.o crypto_decode_857x5167.o crypto_decode_857xint16.o crypto_decode_857xint32.o crypto_encode_857x1723.o crypto_encode_857x1723round.o crypto_encode_857x3.o crypto_encode_857x5167.o crypto_encode_857xfreeze3.o crypto_encode_857xint16.o crypto_encode_int16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1184.o kem.o + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/sntrup857/clean/Makefile.Microsoft_nmake b/crypto_kem/sntrup857/clean/Makefile.Microsoft_nmake new file mode 100644 index 00000000..a2f0c64d --- /dev/null +++ b/crypto_kem/sntrup857/clean/Makefile.Microsoft_nmake @@ -0,0 +1,19 @@ +# This Makefile can be used with Microsoft Visual Studio's nmake using the command: +# nmake /f Makefile.Microsoft_nmake + +LIBRARY=libsntrup857_clean.lib +OBJECTS=crypto_core_inv3sntrup857.obj crypto_core_invsntrup857.obj crypto_core_mult3sntrup857.obj crypto_core_multsntrup857.obj crypto_core_scale3sntrup857.obj crypto_core_weightsntrup857.obj crypto_core_wforcesntrup857.obj crypto_decode_857x1723.obj crypto_decode_857x3.obj crypto_decode_857x5167.obj crypto_decode_857xint16.obj crypto_decode_857xint32.obj crypto_encode_857x1723.obj crypto_encode_857x1723round.obj crypto_encode_857x3.obj crypto_encode_857x5167.obj crypto_encode_857xfreeze3.obj crypto_encode_857xint16.obj crypto_encode_int16.obj crypto_sort_int32.obj crypto_sort_uint32.obj crypto_stream_aes256ctr.obj crypto_verify_1184.obj kem.obj + +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX + +all: $(LIBRARY) + +# Make sure objects are recompiled if headers change. 
+$(OBJECTS): *.h + +$(LIBRARY): $(OBJECTS) + LIB.EXE /NOLOGO /WX /OUT:$@ $** + +clean: + -DEL $(OBJECTS) + -DEL $(LIBRARY) diff --git a/crypto_kem/sntrup857/clean/api.h b/crypto_kem/sntrup857/clean/api.h new file mode 100644 index 00000000..cb9e4c58 --- /dev/null +++ b/crypto_kem/sntrup857/clean/api.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_API_H +#define PQCLEAN_SNTRUP857_CLEAN_API_H + + + +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ALGNAME "sntrup857" + +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_SECRETKEYBYTES 1999 +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_PUBLICKEYBYTES 1322 +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CIPHERTEXTBYTES 1184 +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_BYTES 32 + +int PQCLEAN_SNTRUP857_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int PQCLEAN_SNTRUP857_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); +int PQCLEAN_SNTRUP857_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_core_inv3sntrup857.c b/crypto_kem/sntrup857/clean/crypto_core_inv3sntrup857.c new file mode 100644 index 00000000..234b2ad4 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_inv3sntrup857.c @@ -0,0 +1,110 @@ +#include "crypto_core_inv3sntrup857.h" +#include "params.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* works for -16384 <= x < 16384 */ +static small F3_freeze(int16 x) { + return x - 3 * ((10923 * x + 16384) >> 15); +} + +/* byte p of output is 0 if recip succeeded; else -1 */ +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_inv3sntrup857(unsigned char *outbytes, const unsigned char *inbytes) { + small *out = (void *) outbytes; + small *in = (void *) inbytes; + small f[p + 1], g[p + 1], v[p + 1], r[p + 1]; + int i, loop, delta; + int sign, swap, t; + + for (i = 0; i < p + 1; ++i) { + v[i] = 0; + } + for (i = 0; i < p + 1; ++i) { + r[i] = 0; + } + r[0] = 1; + for (i = 0; i < p; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = f[p] = -1; + for (i = 0; i < p; ++i) { + small i1 = in[i] & 1; + g[p - 1 - i] = i1 - (in[i] & (i1 << 1)); + } + g[p] = 0; + + delta = 1; + + for (loop = 0; loop < 2 * p - 1; ++loop) { + for (i = p; i > 0; --i) { + v[i] = v[i - 1]; + } + v[0] = 0; + + sign = -g[0] * f[0]; + swap = int16_negative_mask(-delta) & int16_nonzero_mask(g[0]); + delta ^= swap & (delta ^ -delta); + delta += 1; + + for (i = 0; i < p + 1; ++i) { + t = swap & (f[i] ^ g[i]); + f[i] ^= t; + g[i] ^= t; + t = swap & (v[i] ^ r[i]); + v[i] ^= t; + r[i] ^= t; + } + + for (i = 0; i < p + 1; ++i) { + g[i] = F3_freeze(g[i] + sign * f[i]); + } + for (i = 0; i < p + 1; ++i) { + r[i] = F3_freeze(r[i] + sign * v[i]); + } + + for (i = 0; i < p; ++i) { + g[i] = g[i + 1]; + } + g[p] = 0; + } + + sign = f[0]; + for (i = 0; i < p; ++i) { + out[i] = sign * v[p - 1 - i]; + } + + 
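+    /* After the fixed 2*p-1 iterations of the constant-time loop above,
+       delta == 0 exactly when the reciprocal computation succeeded, so the
+       extra byte written below is 0 on success and -1 on failure, matching
+       the "byte p of output" comment at the top of this function. */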
out[p] = int16_nonzero_mask(delta); + return 0; +} diff --git a/crypto_kem/sntrup857/clean/crypto_core_inv3sntrup857.h b/crypto_kem/sntrup857/clean/crypto_core_inv3sntrup857.h new file mode 100644 index 00000000..f56ceac1 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_inv3sntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_INV3SNTRUP857_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_INV3SNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_inv3sntrup857_OUTPUTBYTES 858 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_inv3sntrup857_INPUTBYTES 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_inv3sntrup857_KEYBYTES 0 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_inv3sntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_inv3sntrup857(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_core_invsntrup857.c b/crypto_kem/sntrup857/clean/crypto_core_invsntrup857.c new file mode 100644 index 00000000..eb9823b8 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_invsntrup857.c @@ -0,0 +1,131 @@ +#include "crypto_core_invsntrup857.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod q */ + +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +static Fq Fq_recip(Fq a1) { + int i = 1; + Fq ai = a1; + + while (i < q - 2) { + ai = Fq_freeze(a1 * (int32)ai); + i += 1; + } + return ai; +} + +/* ----- polynomials mod q */ + +/* out = 1/(3*in) in Rq */ +/* outbytes[2*p] is 0 if recip succeeded; else -1 */ +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_invsntrup857(unsigned char *outbytes, const unsigned char *inbytes) { + small *in = (void *) inbytes; + Fq out[p], f[p + 1], g[p + 1], v[p + 1], r[p + 1]; + int i, loop, delta; + int swap, t; + int32 f0, g0; + Fq scale; + + for (i = 0; i < p + 1; ++i) { + v[i] = 0; + } + for (i = 0; i < p + 1; ++i) { + r[i] = 0; + } + r[0] = Fq_recip(3); + for (i = 0; i < p; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = f[p] = -1; + for (i = 0; i < p; ++i) { + g[p - 1 - i] = in[i]; + } + g[p] = 0; + + delta = 1; + + for (loop = 0; loop < 2 * p - 1; ++loop) { + for (i = p; i > 0; --i) { + v[i] = v[i - 1]; + } + v[0] = 0; + + swap = int16_negative_mask(-delta) & int16_nonzero_mask(g[0]); + delta ^= swap & (delta ^ -delta); + delta += 1; + + for (i = 0; i < p + 1; ++i) { + t = swap & (f[i] ^ g[i]); + f[i] ^= t; + g[i] ^= t; + t = swap & (v[i] ^ r[i]); + v[i] ^= t; + r[i] ^= t; + } + + f0 = f[0]; + g0 = g[0]; + for (i = 0; i < p + 1; ++i) { + g[i] = Fq_freeze(f0 * g[i] - g0 * f[i]); + } + for (i = 0; i < p + 1; ++i) { + r[i] = Fq_freeze(f0 * r[i] - g0 * v[i]); + } + + for (i = 0; i < p; ++i) { + g[i] = g[i + 1]; + 
} + g[p] = 0; + } + + scale = Fq_recip(f[0]); + for (i = 0; i < p; ++i) { + out[i] = Fq_freeze(scale * (int32)v[p - 1 - i]); + } + + crypto_encode_pxint16(outbytes, out); + + outbytes[2 * p] = int16_nonzero_mask(delta); + return 0; +} diff --git a/crypto_kem/sntrup857/clean/crypto_core_invsntrup857.h b/crypto_kem/sntrup857/clean/crypto_core_invsntrup857.h new file mode 100644 index 00000000..a69a68fd --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_invsntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_INVSNTRUP857_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_INVSNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_invsntrup857_OUTPUTBYTES 1715 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_invsntrup857_INPUTBYTES 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_invsntrup857_KEYBYTES 0 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_invsntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_invsntrup857(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_core_mult3sntrup857.c b/crypto_kem/sntrup857/clean/crypto_core_mult3sntrup857.c new file mode 100644 index 00000000..02d8697b --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_mult3sntrup857.c @@ -0,0 +1,57 @@ +#include "crypto_core_mult3sntrup857.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +typedef int8 small; + +/* works for -16384 <= x < 16384 */ +static small F3_freeze(int16 x) { + return x - 3 * ((10923 * x + 16384) >> 15); +} + +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_mult3sntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + small *h = (void *) outbytes; + small f[p]; + small g[p]; + small fg[p + p - 1]; + int16 result; + int i, j; + + for (i = 0; i < p; ++i) { + small fi = inbytes[i]; + small fi0 = fi & 1; + f[i] = fi0 - (fi & (fi0 << 1)); + } + for (i = 0; i < p; ++i) { + small gi = kbytes[i]; + small gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + for (i = 0; i < p; ++i) { + result = 0; + for (j = 0; j <= i; ++j) { + result += f[j] * g[i - j]; + } + fg[i] = F3_freeze(result); + } + for (i = p; i < p + p - 1; ++i) { + result = 0; + for (j = i - p + 1; j < p; ++j) { + result += f[j] * g[i - j]; + } + fg[i] = F3_freeze(result); + } + + for (i = p + p - 2; i >= p; --i) { + fg[i - p] = F3_freeze(fg[i - p] + fg[i]); + fg[i - p + 1] = F3_freeze(fg[i - p + 1] + fg[i]); + } + + for (i = 0; i < p; ++i) { + h[i] = fg[i]; + } + return 0; +} diff --git a/crypto_kem/sntrup857/clean/crypto_core_mult3sntrup857.h b/crypto_kem/sntrup857/clean/crypto_core_mult3sntrup857.h new file mode 100644 index 00000000..fc5a5492 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_mult3sntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_MULT3SNTRUP857_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_MULT3SNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_mult3sntrup857_OUTPUTBYTES 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_mult3sntrup857_INPUTBYTES 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_mult3sntrup857_KEYBYTES 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_mult3sntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_mult3sntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_core_multsntrup857.c b/crypto_kem/sntrup857/clean/crypto_core_multsntrup857.c new file mode 100644 index 00000000..80474ff8 
--- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_multsntrup857.c @@ -0,0 +1,60 @@ +#include "crypto_core_multsntrup857.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_multsntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + Fq f[p]; + small g[p]; + Fq fg[p + p - 1]; + int32 result; + int i, j; + + crypto_decode_pxint16(f, inbytes); + for (i = 0; i < p; ++i) { + f[i] = Fq_freeze(f[i]); + } + + for (i = 0; i < p; ++i) { + small gi = kbytes[i]; + small gi0 = gi & 1; + g[i] = gi0 - (gi & (gi0 << 1)); + } + + for (i = 0; i < p; ++i) { + result = 0; + for (j = 0; j <= i; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + for (i = p; i < p + p - 1; ++i) { + result = 0; + for (j = i - p + 1; j < p; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + + for (i = p + p - 2; i >= p; --i) { + fg[i - p] = Fq_freeze(fg[i - p] + fg[i]); + fg[i - p + 1] = Fq_freeze(fg[i - p + 1] + fg[i]); + } + + crypto_encode_pxint16(outbytes, fg); + return 0; +} diff --git a/crypto_kem/sntrup857/clean/crypto_core_multsntrup857.h b/crypto_kem/sntrup857/clean/crypto_core_multsntrup857.h new file mode 100644 index 00000000..3eb38c74 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_multsntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_MULTSNTRUP857_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_MULTSNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_multsntrup857_OUTPUTBYTES 1714 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_multsntrup857_INPUTBYTES 1714 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_multsntrup857_KEYBYTES 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_multsntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_multsntrup857(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_core_scale3sntrup857.c b/crypto_kem/sntrup857/clean/crypto_core_scale3sntrup857.c new file mode 100644 index 00000000..7ae6f613 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_scale3sntrup857.c @@ -0,0 +1,32 @@ +#include "crypto_core_scale3sntrup857.h" +#include "crypto_decode_857xint16.h" +#include "crypto_encode_857xint16.h" + + +#define p 857 +#define q 5167 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xint16 + +typedef int16_t Fq; + +/* out = 3*in in Rq */ +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_scale3sntrup857(unsigned char *outbytes, const unsigned char *inbytes) { + Fq f[p]; + int i; + + crypto_decode_pxint16(f, inbytes); + for (i = 0; i < p; ++i) { + Fq x = f[i]; + x *= 3; /* (-3q+3)/2 ... (3q-3)/2 */ + x -= (q + 1) / 2; /* -2q+1 ... q-2 */ + x += q & (x >> 15); /* -q+1 ... q-1 */ + x += q & (x >> 15); /* 0 ... q-1 */ + x -= (q - 1) / 2; /* -(q-1)/2 ... 
(q-1)/2 */ + f[i] = x; + } + crypto_encode_pxint16(outbytes, f); + + return 0; +} diff --git a/crypto_kem/sntrup857/clean/crypto_core_scale3sntrup857.h b/crypto_kem/sntrup857/clean/crypto_core_scale3sntrup857.h new file mode 100644 index 00000000..15fb76a5 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_scale3sntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_SCALE3SNTRUP857_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_SCALE3SNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_scale3sntrup857_OUTPUTBYTES 1714 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_scale3sntrup857_INPUTBYTES 1714 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_scale3sntrup857_KEYBYTES 0 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_scale3sntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_scale3sntrup857(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_core_weightsntrup857.c b/crypto_kem/sntrup857/clean/crypto_core_weightsntrup857.c new file mode 100644 index 00000000..e8274227 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_weightsntrup857.c @@ -0,0 +1,21 @@ +#include "crypto_core_weightsntrup857.h" +#include "crypto_encode_int16.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t + + +/* out = little-endian weight of bottom bits of in */ +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_weightsntrup857(unsigned char *outbytes, const unsigned char *inbytes) { + int8 *in = (void *) inbytes; + int16 weight = 0; + int i; + + for (i = 0; i < p; ++i) { + weight += in[i] & 1; + } + PQCLEAN_SNTRUP857_CLEAN_crypto_encode_int16(outbytes, &weight); + return 0; +} diff --git a/crypto_kem/sntrup857/clean/crypto_core_weightsntrup857.h b/crypto_kem/sntrup857/clean/crypto_core_weightsntrup857.h new file mode 100644 index 00000000..e32e28cd --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_weightsntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_WEIGHTSNTRUP857_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_WEIGHTSNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_weightsntrup857_OUTPUTBYTES 2 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_weightsntrup857_INPUTBYTES 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_weightsntrup857_KEYBYTES 0 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_weightsntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_weightsntrup857(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_core_wforcesntrup857.c b/crypto_kem/sntrup857/clean/crypto_core_wforcesntrup857.c new file mode 100644 index 00000000..6a8dc8fd --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_wforcesntrup857.c @@ -0,0 +1,48 @@ +#include "crypto_core_wforcesntrup857.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +typedef int8 small; + + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = -v; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -v; /* 0, else -1 */ +} + +/* 0 if Weightw_is(r), else -1 */ +static int Weightw_mask(const small *r) { + int weight = 0; + int i; + + for (i = 0; i < p; ++i) { + weight += r[i] & 1; + } + return int16_nonzero_mask(weight - w); +} + +/* out = in if bottom bits of in have weight w */ +/* otherwise out = (1,1,...,1,0,0,...,0) */ 
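+/* Weightw_mask() returns 0 or -1, so the loops below select each output byte
+   without branching:
+     i < w:   ((in[i] ^ 1) & ~mask) ^ 1   = in[i] if mask == 0, else 1
+     i >= w:  in[i] & ~mask               = in[i] if mask == 0, else 0
+   i.e. the fixed weight-w pattern (1,...,1,0,...,0) whenever the weight check fails. */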
+int PQCLEAN_SNTRUP857_CLEAN_crypto_core_wforcesntrup857(unsigned char *outbytes, const unsigned char *inbytes) { + small *out = (void *) outbytes; + const small *in = (const void *) inbytes; + int i, mask; + + mask = Weightw_mask(in); /* 0 if weight w, else -1 */ + for (i = 0; i < w; ++i) { + out[i] = ((in[i] ^ 1) & ~mask) ^ 1; + } + for (i = w; i < p; ++i) { + out[i] = in[i] & ~mask; + } + return 0; +} diff --git a/crypto_kem/sntrup857/clean/crypto_core_wforcesntrup857.h b/crypto_kem/sntrup857/clean/crypto_core_wforcesntrup857.h new file mode 100644 index 00000000..f36d8281 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_core_wforcesntrup857.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_WFORCESNTRUP857_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_CORE_WFORCESNTRUP857_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_wforcesntrup857_OUTPUTBYTES 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_wforcesntrup857_INPUTBYTES 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_wforcesntrup857_KEYBYTES 0 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_core_wforcesntrup857_CONSTBYTES 0 + +int PQCLEAN_SNTRUP857_CLEAN_crypto_core_wforcesntrup857(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_decode_857x1723.c b/crypto_kem/sntrup857/clean/crypto_decode_857x1723.c new file mode 100644 index 00000000..ab803b68 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_decode_857x1723.c @@ -0,0 +1,202 @@ +#include "crypto_decode_857x1723.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* +CPU division instruction typically takes time depending on x. +This software is designed to take time independent of x. +Time still varies depending on m; user must ensure that m is constant. +Time also varies on CPUs where multiplication is variable-time. +There could be more CPU issues. +There could also be compiler issues. 
+*/ + +static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { + uint32 v = 0x80000000; + uint32 qpart; + uint32 mask; + + v /= m; + + /* caller guarantees m > 0 */ + /* caller guarantees m < 16384 */ + /* vm <= 2^31 <= vm+m-1 */ + /* xvm <= 2^31 x <= xvm+x(m-1) */ + + *q = 0; + + qpart = (x * (uint64)v) >> 31; + /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ + /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ + /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ + /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ + + x -= qpart * m; + *q += qpart; + /* x <= 49146 */ + + qpart = (x * (uint64)v) >> 31; + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ + /* 0 <= newx <= m + 0.4 */ + /* 0 <= newx <= m */ + + x -= qpart * m; + *q += qpart; + /* x <= m */ + + x -= m; + *q += 1; + mask = -(x >> 31); + x += mask & (uint32)m; + *q += mask; + /* x < m */ + + *r = x; +} + +static uint16 uint32_mod_uint14(uint32 x, uint16 m) { + uint32 q; + uint16 r; + uint32_divmod_uint14(&q, &r, x, m); + return r; +} + +void PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x1723(void *v, const unsigned char *s) { + int16 *R0 = v; + uint16 R1[429], R2[215], R3[108], R4[54], R5[27], R6[14], R7[7], R8[4], R9[2], R10[1]; + long long i; + uint16 r0; + uint32 r1, r2; + + s += PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x1723_STRBYTES; + r1 = 0; + r1 = (r1 << 8) | *--s; + r1 = uint32_mod_uint14(r1, 160); /* needed only for invalid inputs */ + R10[0] = r1; + + r2 = R10[0]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 743); + R9[0] = r0; + r1 = uint32_mod_uint14(r1, 14044); /* needed only for invalid inputs */ + R9[1] = r1; + + r2 = R9[1]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 436); + R8[2] = r0; + r1 = uint32_mod_uint14(r1, 8246); /* needed only for invalid inputs */ + R8[3] = r1; + r2 = R9[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 436); + R8[0] = r0; + r1 = uint32_mod_uint14(r1, 436); /* needed only for invalid inputs */ + R8[1] = r1; + + R7[6] = R8[3]; + for (i = 2; i >= 0; --i) { + r2 = R8[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 334); + R7[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 334); /* needed only for invalid inputs */ + R7[2 * i + 1] = r1; + } + + r2 = R7[6]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 292); + R6[12] = r0; + r1 = uint32_mod_uint14(r1, 7229); /* needed only for invalid inputs */ + R6[13] = r1; + for (i = 5; i >= 0; --i) { + r2 = R7[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 292); + R6[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 292); /* needed only for invalid inputs */ + R6[2 * i + 1] = r1; + } + + R5[26] = R6[13]; + for (i = 12; i >= 0; --i) { + r2 = R6[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 273); + R5[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 273); /* needed only for invalid inputs */ + R5[2 * i + 1] = r1; + } + + r2 = R5[26]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 4225); + R4[52] = r0; + r1 = uint32_mod_uint14(r1, 438); /* needed only for invalid inputs */ + R4[53] = r1; + for (i = 25; i >= 0; --i) { + r2 = R5[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 4225); + R4[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 4225); /* needed only for invalid inputs */ + R4[2 * i + 1] = r1; + } 
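+    /* Each block below undoes one more level of the mixed-radix packing used by
+       crypto_encode_857x1723; this level splits every R4 word into two radix-65
+       digits (R3), with uint32_divmod_uint14 keeping the running time independent
+       of the input bytes. */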
+ + r2 = R4[53]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 65); + R3[106] = r0; + r1 = uint32_mod_uint14(r1, 1723); /* needed only for invalid inputs */ + R3[107] = r1; + for (i = 52; i >= 0; --i) { + r2 = R4[i]; + uint32_divmod_uint14(&r1, &r0, r2, 65); + R3[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 65); /* needed only for invalid inputs */ + R3[2 * i + 1] = r1; + } + + R2[214] = R3[107]; + for (i = 106; i >= 0; --i) { + r2 = R3[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 2053); + R2[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 2053); /* needed only for invalid inputs */ + R2[2 * i + 1] = r1; + } + + R1[428] = R2[214]; + for (i = 213; i >= 0; --i) { + r2 = R2[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 11597); + R1[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 11597); /* needed only for invalid inputs */ + R1[2 * i + 1] = r1; + } + + R0[856] = 3 * R1[428] - 2583; + for (i = 427; i >= 0; --i) { + r2 = R1[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1723); + R0[2 * i] = 3 * r0 - 2583; + r1 = uint32_mod_uint14(r1, 1723); /* needed only for invalid inputs */ + R0[2 * i + 1] = 3 * r1 - 2583; + } +} diff --git a/crypto_kem/sntrup857/clean/crypto_decode_857x1723.h b/crypto_kem/sntrup857/clean/crypto_decode_857x1723.h new file mode 100644 index 00000000..e344d0f1 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_decode_857x1723.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_DECODE_857X1723_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_DECODE_857X1723_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x1723_STRBYTES 1152 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x1723_ITEMS 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x1723_ITEMBYTES 2 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x1723(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_decode_857x3.c b/crypto_kem/sntrup857/clean/crypto_decode_857x3.c new file mode 100644 index 00000000..b2f433e6 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_decode_857x3.c @@ -0,0 +1,24 @@ +#include "crypto_decode_857x3.h" + +#define uint8 uint8_t + +#define p 857 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x3(void *v, const unsigned char *s) { + uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + x >>= 2; + *f++ = ((uint8)(x & 3)) - 1; + } + x = *s++; + *f++ = ((uint8)(x & 3)) - 1; +} diff --git a/crypto_kem/sntrup857/clean/crypto_decode_857x3.h b/crypto_kem/sntrup857/clean/crypto_decode_857x3.h new file mode 100644 index 00000000..f264068f --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_decode_857x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_DECODE_857X3_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_DECODE_857X3_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x3_STRBYTES 215 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x3_ITEMS 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x3(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_decode_857x5167.c b/crypto_kem/sntrup857/clean/crypto_decode_857x5167.c new file mode 100644 index 00000000..1068effe --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_decode_857x5167.c @@ -0,0 +1,205 @@ +#include 
"crypto_decode_857x5167.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* +CPU division instruction typically takes time depending on x. +This software is designed to take time independent of x. +Time still varies depending on m; user must ensure that m is constant. +Time also varies on CPUs where multiplication is variable-time. +There could be more CPU issues. +There could also be compiler issues. +*/ + +static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { + uint32 v = 0x80000000; + uint32 qpart; + uint32 mask; + + v /= m; + + /* caller guarantees m > 0 */ + /* caller guarantees m < 16384 */ + /* vm <= 2^31 <= vm+m-1 */ + /* xvm <= 2^31 x <= xvm+x(m-1) */ + + *q = 0; + + qpart = (x * (uint64)v) >> 31; + /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ + /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ + /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ + /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ + + x -= qpart * m; + *q += qpart; + /* x <= 49146 */ + + qpart = (x * (uint64)v) >> 31; + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ + /* 0 <= newx <= m + 0.4 */ + /* 0 <= newx <= m */ + + x -= qpart * m; + *q += qpart; + /* x <= m */ + + x -= m; + *q += 1; + mask = -(x >> 31); + x += mask & (uint32)m; + *q += mask; + /* x < m */ + + *r = x; +} + +static uint16 uint32_mod_uint14(uint32 x, uint16 m) { + uint32 q; + uint16 r; + uint32_divmod_uint14(&q, &r, x, m); + return r; +} + +void PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x5167(void *v, const unsigned char *s) { + int16 *R0 = v; + uint16 R1[429], R2[215], R3[108], R4[54], R5[27], R6[14], R7[7], R8[4], R9[2], R10[1]; + long long i; + uint16 r0; + uint32 r1, r2; + + s += PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x5167_STRBYTES; + r1 = 0; + r1 = (r1 << 8) | *--s; + r1 = (r1 << 8) | *--s; + r1 = uint32_mod_uint14(r1, 6225); /* needed only for invalid inputs */ + R10[0] = r1; + + r2 = R10[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 5476); + R9[0] = r0; + r1 = uint32_mod_uint14(r1, 291); /* needed only for invalid inputs */ + R9[1] = r1; + + r2 = R9[1]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 74); + R8[2] = r0; + r1 = uint32_mod_uint14(r1, 1004); /* needed only for invalid inputs */ + R8[3] = r1; + r2 = R9[0]; + uint32_divmod_uint14(&r1, &r0, r2, 74); + R8[0] = r0; + r1 = uint32_mod_uint14(r1, 74); /* needed only for invalid inputs */ + R8[1] = r1; + + R7[6] = R8[3]; + for (i = 2; i >= 0; --i) { + r2 = R8[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 2194); + R7[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 2194); /* needed only for invalid inputs */ + R7[2 * i + 1] = r1; + } + + r2 = R7[6]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 11991); + R6[12] = r0; + r1 = uint32_mod_uint14(r1, 5483); /* needed only for invalid inputs */ + R6[13] = r1; + for (i = 5; i >= 0; --i) { + r2 = R7[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 11991); + R6[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 11991); /* needed only for invalid inputs */ + R6[2 * i + 1] = r1; + } + + R5[26] = R6[13]; + for (i = 12; i >= 0; --i) { + r2 = R6[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1752); + 
R5[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1752); /* needed only for invalid inputs */ + R5[2 * i + 1] = r1; + } + + r2 = R5[26]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 10713); + R4[52] = r0; + r1 = uint32_mod_uint14(r1, 131); /* needed only for invalid inputs */ + R4[53] = r1; + for (i = 25; i >= 0; --i) { + r2 = R5[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 10713); + R4[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 10713); /* needed only for invalid inputs */ + R4[2 * i + 1] = r1; + } + + r2 = R4[53]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1656); + R3[106] = r0; + r1 = uint32_mod_uint14(r1, 5167); /* needed only for invalid inputs */ + R3[107] = r1; + for (i = 52; i >= 0; --i) { + r2 = R4[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1656); + R3[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1656); /* needed only for invalid inputs */ + R3[2 * i + 1] = r1; + } + + R2[214] = R3[107]; + for (i = 106; i >= 0; --i) { + r2 = R3[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 651); + R2[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 651); /* needed only for invalid inputs */ + R2[2 * i + 1] = r1; + } + + R1[428] = R2[214]; + for (i = 213; i >= 0; --i) { + r2 = R2[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 408); + R1[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 408); /* needed only for invalid inputs */ + R1[2 * i + 1] = r1; + } + + R0[856] = R1[428] - 2583; + for (i = 427; i >= 0; --i) { + r2 = R1[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 5167); + R0[2 * i] = r0 - 2583; + r1 = uint32_mod_uint14(r1, 5167); /* needed only for invalid inputs */ + R0[2 * i + 1] = r1 - 2583; + } +} diff --git a/crypto_kem/sntrup857/clean/crypto_decode_857x5167.h b/crypto_kem/sntrup857/clean/crypto_decode_857x5167.h new file mode 100644 index 00000000..34880ea6 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_decode_857x5167.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_DECODE_857X5167_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_DECODE_857X5167_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x5167_STRBYTES 1322 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x5167_ITEMS 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x5167_ITEMBYTES 2 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x5167(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_decode_857xint16.c b/crypto_kem/sntrup857/clean/crypto_decode_857xint16.c new file mode 100644 index 00000000..7841b093 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_decode_857xint16.c @@ -0,0 +1,16 @@ +#include "crypto_decode_857xint16.h" + + +void PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 <<= 8; + *x = u0 | u1; + x += 1; + s += 2; + } +} diff --git a/crypto_kem/sntrup857/clean/crypto_decode_857xint16.h b/crypto_kem/sntrup857/clean/crypto_decode_857xint16.h new file mode 100644 index 00000000..c13478c8 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_decode_857xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_DECODE_857XINT16_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_DECODE_857XINT16_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint16_STRBYTES 1714 +#define 
PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint16_ITEMS 857 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint16(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_decode_857xint32.c b/crypto_kem/sntrup857/clean/crypto_decode_857xint32.c new file mode 100644 index 00000000..8da1a180 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_decode_857xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_857xint32.h" + + +void PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/crypto_kem/sntrup857/clean/crypto_decode_857xint32.h b/crypto_kem/sntrup857/clean/crypto_decode_857xint32.h new file mode 100644 index 00000000..6d0233a4 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_decode_857xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_DECODE_857XINT32_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_DECODE_857XINT32_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint32_STRBYTES 3428 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint32_ITEMBYTES 4 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint32_ITEMS 857 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint32(void *v, const unsigned char *s); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857x1723.c b/crypto_kem/sntrup857/clean/crypto_encode_857x1723.c new file mode 100644 index 00000000..7a33a9a6 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857x1723.c @@ -0,0 +1,130 @@ +#include "crypto_encode_857x1723.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[429]; + long i; + uint16 r0, r1; + uint32 r2; + + for (i = 0; i < 428; ++i) { + r0 = (((R0[2 * i] + 2583) & 16383) * 10923) >> 15; + r1 = (((R0[2 * i + 1] + 2583) & 16383) * 10923) >> 15; + r2 = r0 + r1 * (uint32)1723; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[428] = (((R0[856] + 2583) & 16383) * 10923) >> 15; + + for (i = 0; i < 214; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)11597; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[214] = R[428]; + + for (i = 0; i < 107; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2053; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[107] = R[214]; + + for (i = 0; i < 53; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)65; + R[i] = r2; + } + r0 = R[106]; + r1 = R[107]; + r2 = r0 + r1 * (uint32)65; + *out++ = r2; + r2 >>= 8; + R[53] = r2; + + for (i = 0; i < 26; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)4225; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[52]; + r1 = R[53]; + r2 = r0 + r1 * (uint32)4225; + *out++ = r2; + r2 >>= 8; + R[26] = r2; + + for (i = 0; i < 13; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)273; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[13] = R[26]; + + for (i = 0; i < 7; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * 
(uint32)292; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)334; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[3] = R[6]; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)436; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)743; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857x1723.h b/crypto_kem/sntrup857/clean/crypto_encode_857x1723.h new file mode 100644 index 00000000..c13d5d6b --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857x1723.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857X1723_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857X1723_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723_STRBYTES 1152 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723_ITEMS 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723_ITEMBYTES 2 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857x1723round.c b/crypto_kem/sntrup857/clean/crypto_encode_857x1723round.c new file mode 100644 index 00000000..fd73c0f5 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857x1723round.c @@ -0,0 +1,17 @@ +#include "crypto_encode_857x1723.h" +#include "crypto_encode_857x1723round.h" + +#define int16 int16_t + +#define p 857 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723round(unsigned char *out, const void *v) { + const int16 *a = v; + int16 x[p]; + int i; + + for (i = 0; i < p; ++i) { + x[i] = 3 * ((10923 * a[i] + 16384) >> 15); + } + PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723(out, x); +} diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857x1723round.h b/crypto_kem/sntrup857/clean/crypto_encode_857x1723round.h new file mode 100644 index 00000000..a6e3fa92 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857x1723round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857X1723ROUND_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857X1723ROUND_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723round_STRBYTES 1152 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723round_ITEMS 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723round_ITEMBYTES 2 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723round(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857x3.c b/crypto_kem/sntrup857/clean/crypto_encode_857x3.c new file mode 100644 index 00000000..de61c551 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857x3.c @@ -0,0 +1,21 @@ +#include "crypto_encode_857x3.h" + +#define uint8 uint8_t + +#define p 857 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x3(unsigned char *s, const void *v) { + const uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *f++ + 1; + x += (*f++ + 1) << 2; + x += (*f++ + 1) << 4; + x += (*f++ + 1) << 6; + *s++ = x; + } + x = *f++ + 1; + *s++ = x; +} diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857x3.h b/crypto_kem/sntrup857/clean/crypto_encode_857x3.h new file mode 100644 index 00000000..0bd9a6a5 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857x3.h @@ -0,0 +1,10 @@ +#ifndef 
PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857X3_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857X3_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x3_STRBYTES 215 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x3_ITEMS 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857x5167.c b/crypto_kem/sntrup857/clean/crypto_encode_857x5167.c new file mode 100644 index 00000000..064d5249 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857x5167.c @@ -0,0 +1,138 @@ +#include "crypto_encode_857x5167.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x5167(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[429]; + long i; + uint16 r0, r1; + uint32 r2; + + for (i = 0; i < 428; ++i) { + r0 = (R0[2 * i] + 2583) & 16383; + r1 = (R0[2 * i + 1] + 2583) & 16383; + r2 = r0 + r1 * (uint32)5167; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[428] = (R0[856] + 2583) & 16383; + + for (i = 0; i < 214; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)408; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[214] = R[428]; + + for (i = 0; i < 107; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)651; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[107] = R[214]; + + for (i = 0; i < 53; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1656; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[106]; + r1 = R[107]; + r2 = r0 + r1 * (uint32)1656; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[53] = r2; + + for (i = 0; i < 26; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)10713; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + r0 = R[52]; + r1 = R[53]; + r2 = r0 + r1 * (uint32)10713; + *out++ = r2; + r2 >>= 8; + R[26] = r2; + + for (i = 0; i < 13; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1752; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[13] = R[26]; + + for (i = 0; i < 7; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)11991; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)2194; + *out++ = r2; + r2 >>= 8; + *out++ = r2; + r2 >>= 8; + R[i] = r2; + } + R[3] = R[6]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)74; + R[0] = r2; + r0 = R[2]; + r1 = R[3]; + r2 = r0 + r1 * (uint32)74; + *out++ = r2; + r2 >>= 8; + R[1] = r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)5476; + *out++ = r2; + r2 >>= 8; + R[0] = r2; + + r0 = R[0]; + *out++ = r0; + r0 >>= 8; + *out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ +} diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857x5167.h b/crypto_kem/sntrup857/clean/crypto_encode_857x5167.h new file mode 100644 index 00000000..03965ec9 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857x5167.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857X5167_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857X5167_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x5167_STRBYTES 1322 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x5167_ITEMS 857 
+#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x5167_ITEMBYTES 2 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x5167(unsigned char *out, const void *v); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857xfreeze3.c b/crypto_kem/sntrup857/clean/crypto_encode_857xfreeze3.c new file mode 100644 index 00000000..99fc3bc5 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857xfreeze3.c @@ -0,0 +1,25 @@ +#include "crypto_encode_857xfreeze3.h" + +#define int16 int16_t + +#define p 857 + +/* valid inputs: -16384 <= x < 16384 */ +/* then 3 divides x-F3_freeze(x) */ +/* and F3_freeze(x) is in {-1,0,1} */ + +/* all inputs: 3 divides x-F3_freeze(x) */ +/* and F3_freeze(x) is in {-2,-1,0,1,2} */ + +static inline unsigned char F3_freeze(int16 x) { + return x - 3 * ((10923 * x + 16384) >> 15); +} + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xfreeze3(unsigned char *s, const void *v) { + const int16 *r = v; + + int i; + for (i = 0; i < p; ++i) { + s[i] = F3_freeze(r[i]); + } +} diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857xfreeze3.h b/crypto_kem/sntrup857/clean/crypto_encode_857xfreeze3.h new file mode 100644 index 00000000..3db63e85 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857xfreeze3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857XFREEZE3_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857XFREEZE3_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xfreeze3_STRBYTES 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xfreeze3_ITEMS 857 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xfreeze3_ITEMBYTES 2 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xfreeze3(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857xint16.c b/crypto_kem/sntrup857/clean/crypto_encode_857xint16.c new file mode 100644 index 00000000..d496ac17 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_857xint16.h" + + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 857; ++i) { + uint16_t u = *x++; + *s++ = u; + *s++ = u >> 8; + } +} diff --git a/crypto_kem/sntrup857/clean/crypto_encode_857xint16.h b/crypto_kem/sntrup857/clean/crypto_encode_857xint16.h new file mode 100644 index 00000000..bd42a80d --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_857xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857XINT16_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_857XINT16_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xint16_STRBYTES 1714 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xint16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xint16_ITEMS 857 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xint16(unsigned char *s, const void *v); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_encode_int16.c b/crypto_kem/sntrup857/clean/crypto_encode_int16.c new file mode 100644 index 00000000..30756273 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_encode_int16.c @@ -0,0 +1,9 @@ +#include "crypto_encode_int16.h" + +#define uint16 uint16_t + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_int16(unsigned char *s, const void *x) { + uint16 u = *(const uint16 *) x; + s[0] = u; + s[1] = u >> 8; +} diff --git a/crypto_kem/sntrup857/clean/crypto_encode_int16.h b/crypto_kem/sntrup857/clean/crypto_encode_int16.h new file mode 100644 index 00000000..5ee7e3c3 --- /dev/null 
+++ b/crypto_kem/sntrup857/clean/crypto_encode_int16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_INT16_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_ENCODE_INT16_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_int16_STRBYTES 2 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_int16_ITEMBYTES 2 +#define PQCLEAN_SNTRUP857_CLEAN_crypto_encode_int16_ITEMS 1 + +void PQCLEAN_SNTRUP857_CLEAN_crypto_encode_int16(unsigned char *s, const void *x); +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_sort_int32.c b/crypto_kem/sntrup857/clean/crypto_sort_int32.c new file mode 100644 index 00000000..0b1dd2f8 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_sort_int32.c @@ -0,0 +1,86 @@ +#include "crypto_sort_int32.h" +#include +// Based on supercop-20190110/crypto_sort/int32/x86 + + +#define int32 int32_t + +#define int32_MINMAX(a,b) \ + do { \ + int32_t ab = (b) ^ (a); \ + int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \ + c ^= ab & (c ^ (b)); \ + c >>= 31; \ + c &= ab; \ + (a) ^= c; \ + (b) ^= c; \ + } while(0) + +/* assume 2 <= n <= 0x40000000 */ +void PQCLEAN_SNTRUP857_CLEAN_crypto_sort_int32(int32 *array, size_t n) { + size_t top, p, q, r, i, j; + int32 *x = array; + + top = 1; + while (top < n - top) { + top += top; + } + + for (p = top; p >= 1; p >>= 1) { + i = 0; + while (i + 2 * p <= n) { + for (j = i; j < i + p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + i += 2 * p; + } + for (j = i; j < n - p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + + i = 0; + j = 0; + for (q = top; q > p; q >>= 1) { + if (j != i) { + for (;;) { + if (j == n - q) { + goto done; + } + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + if (j == i + p) { + i += 2 * p; + break; + } + } + } + while (i + p <= n - q) { + for (j = i; j < i + p; ++j) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + } + i += 2 * p; + } + /* now i + p > n - q */ + j = i; + while (j < n - q) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + } + +done: + ; + } + } +} diff --git a/crypto_kem/sntrup857/clean/crypto_sort_int32.h b/crypto_kem/sntrup857/clean/crypto_sort_int32.h new file mode 100644 index 00000000..3006e3b2 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_sort_int32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_SORT_INT32_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_SORT_INT32_H + +#include +#include + + +void PQCLEAN_SNTRUP857_CLEAN_crypto_sort_int32(int32_t *array, size_t n); + +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_sort_uint32.c b/crypto_kem/sntrup857/clean/crypto_sort_uint32.c new file mode 100644 index 00000000..4c32bae5 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_sort_uint32.c @@ -0,0 +1,20 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include + +#define uint32 uint32_t + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_SNTRUP857_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n) { + uint32 *x = array; + size_t j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_SNTRUP857_CLEAN_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/crypto_kem/sntrup857/clean/crypto_sort_uint32.h b/crypto_kem/sntrup857/clean/crypto_sort_uint32.h new file mode 100644 index 00000000..2d19744b --- /dev/null +++ 
b/crypto_kem/sntrup857/clean/crypto_sort_uint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_SORT_UINT32_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_SORT_UINT32_H + +#include +#include + + +void PQCLEAN_SNTRUP857_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n); + +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_stream_aes256ctr.c b/crypto_kem/sntrup857/clean/crypto_stream_aes256ctr.c new file mode 100644 index 00000000..e202f8aa --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_stream_aes256ctr.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" + + +int PQCLEAN_SNTRUP857_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); + return 0; +} diff --git a/crypto_kem/sntrup857/clean/crypto_stream_aes256ctr.h b/crypto_kem/sntrup857/clean/crypto_stream_aes256ctr.h new file mode 100644 index 00000000..fae907dc --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_stream_aes256ctr.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_STREAM_AES256CTR_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_STREAM_AES256CTR_H +#include "aes.h" +#include +#include + + + +int PQCLEAN_SNTRUP857_CLEAN_crypto_stream_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]); + +#endif diff --git a/crypto_kem/sntrup857/clean/crypto_verify_1184.c b/crypto_kem/sntrup857/clean/crypto_verify_1184.c new file mode 100644 index 00000000..e2b6856a --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_verify_1184.c @@ -0,0 +1,13 @@ +#include "crypto_verify_1184.h" + + +int PQCLEAN_SNTRUP857_CLEAN_crypto_verify_1184(const unsigned char *x, const unsigned char *y) { + unsigned int differentbits = 0; + int i; + + for (i = 0; i < PQCLEAN_SNTRUP857_CLEAN_crypto_verify_1184_BYTES; ++i) { + differentbits |= x[i] ^ y[i]; + } + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/crypto_kem/sntrup857/clean/crypto_verify_1184.h b/crypto_kem/sntrup857/clean/crypto_verify_1184.h new file mode 100644 index 00000000..9bcbf3f4 --- /dev/null +++ b/crypto_kem/sntrup857/clean/crypto_verify_1184.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_SNTRUP857_CLEAN_CRYPTO_VERIFY_1184_H +#define PQCLEAN_SNTRUP857_CLEAN_CRYPTO_VERIFY_1184_H + +#include +#define PQCLEAN_SNTRUP857_CLEAN_crypto_verify_1184_BYTES 1184 + +int PQCLEAN_SNTRUP857_CLEAN_crypto_verify_1184(const unsigned char *x, const unsigned char *y); +#endif diff --git a/crypto_kem/sntrup857/clean/kem.c b/crypto_kem/sntrup857/clean/kem.c new file mode 100644 index 00000000..74cc1acd --- /dev/null +++ b/crypto_kem/sntrup857/clean/kem.c @@ -0,0 +1,247 @@ +#include "api.h" +#include "crypto_sort_uint32.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* ----- small polynomials */ + +/* R3_fromR(R_fromRq(r)) */ +static void R3_fromRq(small *out, const Fq *r) { + crypto_encode_pxfreeze3((unsigned char *) out, (unsigned char *) r); +} + +/* h = f*g in the ring R3 */ +static void R3_mult(small *h, const small *f, const small *g) { + 
crypto_core_mult3((unsigned char *) h, (const unsigned char *) f, (const unsigned char *) g); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* h = 3f in Rq */ +static void Rq_mult3(Fq *h, const Fq *f) { + crypto_encode_pxint16((unsigned char *) h, f); + crypto_core_scale3((unsigned char *) h, (const unsigned char *) h); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* out = 1/(3*in) in Rq */ +/* caller must have 2p+1 bytes free in out, not just 2p */ +static void Rq_recip3(Fq *out, const small *in) { + crypto_core_inv((unsigned char *) out, (const unsigned char *) in); + /* could check byte 2*p for failure; but, in context, inv always works */ + crypto_decode_pxint16(out, (unsigned char *) out); +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[ppadsort]; + int i; + + randombytes((unsigned char *) L, 4 * p); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < w; ++i) { + L[i] = L[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (L[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_SNTRUP857_CLEAN_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (L[i] & 3) - 1; + } +} + +static void Small_random(small *out) { + uint32 L[p]; + int i; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + out[i] = (((L[i] & 0x3fffffff) * 3) >> 30) - 1; + } +} + +/* ----- Streamlined NTRU Prime */ + +typedef small Inputs[p]; /* passed by reference */ +#define Ciphertexts_bytes Rounded_bytes +#define SecretKeys_bytes (2*Small_bytes) +#define PublicKeys_bytes Rq_bytes +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +/* also set r_enc[0]=3 */ +/* also set x[0]=2, and x[1:1+Hash_bytes] = Hash3(r_enc) */ +/* also overwrite x[1+Hash_bytes:1+2*Hash_bytes] */ +static void Hide(unsigned char *x, unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + Fq h[p]; + int i; + + Small_encode(r_enc + 1, r); + Rq_decode(h, pk); + Rq_mult_small(h, r); + Round_and_encode(c, h); + r_enc[0] = 3; + Hash(x + 1, r_enc, 1 + Small_bytes); + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Hash_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c + Ciphertexts_bytes, x, 1 + Hash_bytes * 2); +} + + +int PQCLEAN_SNTRUP857_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + small g[p]; + for (;;) { + Small_random(g); + { + small v[p + 1]; + crypto_core_inv3((unsigned char *) v, (const unsigned char *) g); + if (v[p] == 0) { + Small_encode(sk + Small_bytes, v); + break; + } + } + } + { + small f[p]; + Short_random(f); + Small_encode(sk, f); + { + Fq h[p + 1]; + Rq_recip3(h, f); /* always works */ + Rq_mult_small(h, g); + Rq_encode(pk, h); + } + } + { + int i; + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + 
sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Small_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Small_bytes); + } + return 0; +} + +int PQCLEAN_SNTRUP857_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + unsigned char cache[Hash_bytes]; + int i; + { + unsigned char y[1 + PublicKeys_bytes]; /* XXX: can eliminate with incremental hashing */ + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + { + Inputs r; + Short_random(r); + { + unsigned char r_enc[Small_bytes + 1]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(x, c, r_enc, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + } + return 0; +} + +int PQCLEAN_SNTRUP857_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Small_bytes; + int mask, i; + Inputs r; + { + Fq d[p]; + Rounded_decode(d, c); + { + small f[p]; + Small_decode(f, sk); + Rq_mult_small(d, f); + Rq_mult3(d, d); + } + { + small e[p]; + small v[p]; + R3_fromRq(e, d); + Small_decode(v, sk + Small_bytes); + R3_mult(r, e, v); + } + crypto_core_wforce((unsigned char *) r, (unsigned char *) r); + } + { + unsigned char r_enc[1 + Small_bytes]; + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + /* XXX: can use incremental hashing to reduce x size */ + + Hide(x, cnew, r_enc, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Small_bytes; ++i) { + r_enc[i + 1] ^= mask & (r_enc[i + 1] ^ rho[i]); + } + Hash(x + 1, r_enc, 1 + Small_bytes); /* XXX: can instead do cmov on cached hash of rho */ + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1 + mask; + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/crypto_kem/sntrup857/clean/params.h b/crypto_kem/sntrup857/clean/params.h new file mode 100644 index 00000000..4b4903d3 --- /dev/null +++ b/crypto_kem/sntrup857/clean/params.h @@ -0,0 +1,68 @@ +#ifndef params_H +#define params_H +#include "crypto_core_inv3sntrup857.h" +#include "crypto_core_invsntrup857.h" +#include "crypto_core_mult3sntrup857.h" +#include "crypto_core_multsntrup857.h" +#include "crypto_core_scale3sntrup857.h" +#include "crypto_core_weightsntrup857.h" +#include "crypto_core_wforcesntrup857.h" +#include "crypto_decode_857x1723.h" +#include "crypto_decode_857x3.h" +#include "crypto_decode_857x5167.h" +#include "crypto_decode_857xint16.h" +#include "crypto_decode_857xint32.h" +#include "crypto_encode_857x1723.h" +#include "crypto_encode_857x1723round.h" +#include "crypto_encode_857x3.h" +#include "crypto_encode_857x5167.h" +#include "crypto_encode_857xfreeze3.h" +#include "crypto_encode_857xint16.h" +#include "crypto_encode_int16.h" +#include "crypto_verify_1184.h" + + +#define p 857 +#define q27 25976 /* closest integer to 2^27/q */ +#define q18 51 /* closest integer to 2^18/q */ +#define q 5167 +#define w 322 + +#define ppadsort 857 + +#define crypto_verify_clen PQCLEAN_SNTRUP857_CLEAN_crypto_verify_1184 + +#define Rq_bytes PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x5167_STRBYTES 
+#define Rq_encode PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x5167 +#define Rq_decode PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x5167 + +#define Rounded_bytes PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x1723_STRBYTES +#define Rounded_decode PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x1723 + +#define Round_and_encode PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x1723round + +#define Small_bytes PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x3_STRBYTES +#define Small_encode PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857x3 +#define Small_decode PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857x3 + +#define crypto_encode_pxfreeze3 PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xfreeze3 + +#define crypto_decode_pxint32 PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint32 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP857_CLEAN_crypto_decode_857xint16 + +#define crypto_encode_pxint16 PQCLEAN_SNTRUP857_CLEAN_crypto_encode_857xint16 + +#define crypto_core_wforce PQCLEAN_SNTRUP857_CLEAN_crypto_core_wforcesntrup857 + +#define crypto_core_scale3 PQCLEAN_SNTRUP857_CLEAN_crypto_core_scale3sntrup857 + +#define crypto_core_inv PQCLEAN_SNTRUP857_CLEAN_crypto_core_invsntrup857 + +#define crypto_core_inv3 PQCLEAN_SNTRUP857_CLEAN_crypto_core_inv3sntrup857 + +#define crypto_core_mult PQCLEAN_SNTRUP857_CLEAN_crypto_core_multsntrup857 + +#define crypto_core_mult3 PQCLEAN_SNTRUP857_CLEAN_crypto_core_mult3sntrup857 + +#endif diff --git a/test/duplicate_consistency/ntrulpr653_avx2.yml b/test/duplicate_consistency/ntrulpr653_avx2.yml new file mode 100644 index 00000000..39486aa4 --- /dev/null +++ b/test/duplicate_consistency/ntrulpr653_avx2.yml @@ -0,0 +1,184 @@ +consistency_checks: +- source: + scheme: sntrup653 + implementation: clean + files: + - crypto_core_multsntrup653.h + - crypto_decode_653x1541.h + - crypto_decode_653x3.h + - crypto_decode_653xint16.h + - crypto_decode_653xint32.h + - crypto_encode_653x1541.h + - crypto_encode_653x1541round.h + - crypto_encode_653x3.h + - crypto_encode_653xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_653xint16.c + - crypto_decode_653xint32.c + - crypto_encode_653xint16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup653 + implementation: avx2 + files: + - crypto_core_multsntrup653.h + - crypto_core_multsntrup653_ntt.h + - crypto_decode_653x1541.h + - crypto_decode_653x3.h + - crypto_decode_653xint16.h + - crypto_decode_653xint32.h + - crypto_encode_653x1541.h + - crypto_encode_653x1541round.h + - crypto_encode_653x3.h + - crypto_encode_653xint16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_core_multsntrup653.c + - crypto_core_multsntrup653_ntt.c + - crypto_decode_653x1541.c + - crypto_decode_653x3.c + - crypto_decode_653xint16.c + - crypto_decode_653xint32.c + - crypto_encode_653x1541.c + - crypto_encode_653x1541round.c + - crypto_encode_653x3.c + - crypto_encode_653xint16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: clean + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: avx2 + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: clean + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - 
crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: avx2 + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: ntrulpr653 + implementation: clean + files: + - api.h + - crypto_core_multsntrup653.h + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_decode_653x1541.h + - crypto_decode_653x3.h + - crypto_decode_653xint16.h + - crypto_decode_653xint32.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_encode_653x1541.h + - crypto_encode_653x1541round.h + - crypto_encode_653x3.h + - crypto_encode_653xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_verify_1025.h + - crypto_decode_256x16.c + - crypto_decode_653xint16.c + - crypto_decode_653xint32.c + - crypto_encode_256x16.c + - crypto_encode_653xint16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr761 + implementation: clean + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_256x16.c + - crypto_encode_256x16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr761 + implementation: avx2 + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_256x16.c + - crypto_decode_256x2.c + - crypto_encode_256x16.c + - crypto_encode_256x2.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr857 + implementation: clean + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_256x16.c + - crypto_encode_256x16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr857 + implementation: avx2 + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_256x16.c + - crypto_decode_256x2.c + - crypto_encode_256x16.c + - crypto_encode_256x2.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c diff --git a/test/duplicate_consistency/ntrulpr653_clean.yml b/test/duplicate_consistency/ntrulpr653_clean.yml new file mode 100644 index 00000000..462a3bb9 --- /dev/null +++ b/test/duplicate_consistency/ntrulpr653_clean.yml @@ -0,0 +1,182 @@ +consistency_checks: +- source: + scheme: sntrup653 + implementation: clean + files: + - crypto_core_multsntrup653.h + - crypto_decode_653x1541.h + - crypto_decode_653x3.h + - crypto_decode_653xint16.h + - crypto_decode_653xint32.h + - crypto_encode_653x1541.h + - crypto_encode_653x1541round.h + - crypto_encode_653x3.h + - crypto_encode_653xint16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_core_multsntrup653.c + - crypto_decode_653x1541.c + - crypto_decode_653x3.c + - crypto_decode_653xint16.c + - crypto_decode_653xint32.c + - crypto_encode_653x1541.c + - crypto_encode_653x1541round.c + - crypto_encode_653x3.c + - crypto_encode_653xint16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - 
crypto_stream_aes256ctr.c +- source: + scheme: sntrup653 + implementation: avx2 + files: + - crypto_core_multsntrup653.h + - crypto_decode_653x1541.h + - crypto_decode_653x3.h + - crypto_decode_653xint16.h + - crypto_decode_653xint32.h + - crypto_encode_653x1541.h + - crypto_encode_653x1541round.h + - crypto_encode_653x3.h + - crypto_encode_653xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_653xint16.c + - crypto_decode_653xint32.c + - crypto_encode_653xint16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: clean + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: avx2 + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: clean + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: avx2 + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: ntrulpr653 + implementation: avx2 + files: + - api.h + - crypto_core_multsntrup653.h + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_decode_653x1541.h + - crypto_decode_653x3.h + - crypto_decode_653xint16.h + - crypto_decode_653xint32.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_encode_653x1541.h + - crypto_encode_653x1541round.h + - crypto_encode_653x3.h + - crypto_encode_653xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_verify_1025.h + - crypto_decode_256x16.c + - crypto_decode_653xint16.c + - crypto_decode_653xint32.c + - crypto_encode_256x16.c + - crypto_encode_653xint16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr761 + implementation: clean + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_256x16.c + - crypto_decode_256x2.c + - crypto_encode_256x16.c + - crypto_encode_256x2.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr761 + implementation: avx2 + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_256x16.c + - crypto_encode_256x16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr857 + implementation: clean + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_256x16.c + - crypto_decode_256x2.c + - crypto_encode_256x16.c + - crypto_encode_256x2.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr857 + implementation: avx2 + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - 
crypto_decode_256x16.c + - crypto_encode_256x16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c diff --git a/test/duplicate_consistency/ntrulpr761_avx2.yml b/test/duplicate_consistency/ntrulpr761_avx2.yml new file mode 100644 index 00000000..551ad24d --- /dev/null +++ b/test/duplicate_consistency/ntrulpr761_avx2.yml @@ -0,0 +1,150 @@ +consistency_checks: +- source: + scheme: sntrup653 + implementation: clean + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup653 + implementation: avx2 + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: clean + files: + - crypto_core_multsntrup761.h + - crypto_decode_761x1531.h + - crypto_decode_761x3.h + - crypto_decode_761xint16.h + - crypto_decode_761xint32.h + - crypto_encode_761x1531.h + - crypto_encode_761x1531round.h + - crypto_encode_761x3.h + - crypto_encode_761xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_761xint16.c + - crypto_decode_761xint32.c + - crypto_encode_761xint16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: avx2 + files: + - crypto_core_multsntrup761.h + - crypto_core_multsntrup761_ntt.h + - crypto_decode_761x1531.h + - crypto_decode_761x3.h + - crypto_decode_761xint16.h + - crypto_decode_761xint32.h + - crypto_encode_761x1531.h + - crypto_encode_761x1531round.h + - crypto_encode_761x3.h + - crypto_encode_761xint16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_core_multsntrup761.c + - crypto_core_multsntrup761_ntt.c + - crypto_decode_761x1531.c + - crypto_decode_761x3.c + - crypto_decode_761xint16.c + - crypto_decode_761xint32.c + - crypto_encode_761x1531.c + - crypto_encode_761x1531round.c + - crypto_encode_761x3.c + - crypto_encode_761xint16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: clean + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: avx2 + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: ntrulpr761 + implementation: clean + files: + - api.h + - crypto_core_multsntrup761.h + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_decode_761x1531.h + - crypto_decode_761x3.h + - crypto_decode_761xint16.h + - crypto_decode_761xint32.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_encode_761x1531.h + - crypto_encode_761x1531round.h + - crypto_encode_761x3.h + - crypto_encode_761xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_verify_1167.h + - crypto_decode_256x16.c + - crypto_decode_761xint16.c + - crypto_decode_761xint32.c + - crypto_encode_256x16.c + - crypto_encode_761xint16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr857 + implementation: clean + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_256x16.c + - crypto_encode_256x16.c + - 
crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr857 + implementation: avx2 + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_256x16.c + - crypto_decode_256x2.c + - crypto_encode_256x16.c + - crypto_encode_256x2.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c diff --git a/test/duplicate_consistency/ntrulpr761_clean.yml b/test/duplicate_consistency/ntrulpr761_clean.yml new file mode 100644 index 00000000..40c1e43e --- /dev/null +++ b/test/duplicate_consistency/ntrulpr761_clean.yml @@ -0,0 +1,148 @@ +consistency_checks: +- source: + scheme: sntrup653 + implementation: clean + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup653 + implementation: avx2 + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: clean + files: + - crypto_core_multsntrup761.h + - crypto_decode_761x1531.h + - crypto_decode_761x3.h + - crypto_decode_761xint16.h + - crypto_decode_761xint32.h + - crypto_encode_761x1531.h + - crypto_encode_761x1531round.h + - crypto_encode_761x3.h + - crypto_encode_761xint16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_core_multsntrup761.c + - crypto_decode_761x1531.c + - crypto_decode_761x3.c + - crypto_decode_761xint16.c + - crypto_decode_761xint32.c + - crypto_encode_761x1531.c + - crypto_encode_761x1531round.c + - crypto_encode_761x3.c + - crypto_encode_761xint16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: avx2 + files: + - crypto_core_multsntrup761.h + - crypto_decode_761x1531.h + - crypto_decode_761x3.h + - crypto_decode_761xint16.h + - crypto_decode_761xint32.h + - crypto_encode_761x1531.h + - crypto_encode_761x1531round.h + - crypto_encode_761x3.h + - crypto_encode_761xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_761xint16.c + - crypto_decode_761xint32.c + - crypto_encode_761xint16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: clean + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: avx2 + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: ntrulpr761 + implementation: avx2 + files: + - api.h + - crypto_core_multsntrup761.h + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_decode_761x1531.h + - crypto_decode_761x3.h + - crypto_decode_761xint16.h + - crypto_decode_761xint32.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_encode_761x1531.h + - crypto_encode_761x1531round.h + - crypto_encode_761x3.h + - crypto_encode_761xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_verify_1167.h + - crypto_decode_256x16.c + - crypto_decode_761xint16.c + - crypto_decode_761xint32.c + - crypto_encode_256x16.c + - crypto_encode_761xint16.c + - crypto_sort_uint32.c + - 
crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr857 + implementation: clean + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_256x16.c + - crypto_decode_256x2.c + - crypto_encode_256x16.c + - crypto_encode_256x2.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: ntrulpr857 + implementation: avx2 + files: + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_256x16.c + - crypto_encode_256x16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c diff --git a/test/duplicate_consistency/ntrulpr857_avx2.yml b/test/duplicate_consistency/ntrulpr857_avx2.yml new file mode 100644 index 00000000..d538d871 --- /dev/null +++ b/test/duplicate_consistency/ntrulpr857_avx2.yml @@ -0,0 +1,116 @@ +consistency_checks: +- source: + scheme: sntrup653 + implementation: clean + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup653 + implementation: avx2 + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: clean + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: avx2 + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: clean + files: + - crypto_core_multsntrup857.h + - crypto_decode_857x1723.h + - crypto_decode_857x3.h + - crypto_decode_857xint16.h + - crypto_decode_857xint32.h + - crypto_encode_857x1723.h + - crypto_encode_857x1723round.h + - crypto_encode_857x3.h + - crypto_encode_857xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_857xint16.c + - crypto_decode_857xint32.c + - crypto_encode_857xint16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: avx2 + files: + - crypto_core_multsntrup857.h + - crypto_core_multsntrup857_ntt.h + - crypto_decode_857x1723.h + - crypto_decode_857x3.h + - crypto_decode_857xint16.h + - crypto_decode_857xint32.h + - crypto_encode_857x1723.h + - crypto_encode_857x1723round.h + - crypto_encode_857x3.h + - crypto_encode_857xint16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_core_multsntrup857.c + - crypto_core_multsntrup857_ntt.c + - crypto_decode_857x1723.c + - crypto_decode_857x3.c + - crypto_decode_857xint16.c + - crypto_decode_857xint32.c + - crypto_encode_857x1723.c + - crypto_encode_857x1723round.c + - crypto_encode_857x3.c + - crypto_encode_857xint16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: ntrulpr857 + implementation: clean + files: + - api.h + - crypto_core_multsntrup857.h + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_decode_857x1723.h + - crypto_decode_857x3.h + - crypto_decode_857xint16.h + - crypto_decode_857xint32.h + - crypto_encode_256x16.h + - 
crypto_encode_256x2.h + - crypto_encode_857x1723.h + - crypto_encode_857x1723round.h + - crypto_encode_857x3.h + - crypto_encode_857xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_verify_1312.h + - crypto_decode_256x16.c + - crypto_decode_857xint16.c + - crypto_decode_857xint32.c + - crypto_encode_256x16.c + - crypto_encode_857xint16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c diff --git a/test/duplicate_consistency/ntrulpr857_clean.yml b/test/duplicate_consistency/ntrulpr857_clean.yml new file mode 100644 index 00000000..ca9508a0 --- /dev/null +++ b/test/duplicate_consistency/ntrulpr857_clean.yml @@ -0,0 +1,114 @@ +consistency_checks: +- source: + scheme: sntrup653 + implementation: clean + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup653 + implementation: avx2 + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: clean + files: + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup761 + implementation: avx2 + files: + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: clean + files: + - crypto_core_multsntrup857.h + - crypto_decode_857x1723.h + - crypto_decode_857x3.h + - crypto_decode_857xint16.h + - crypto_decode_857xint32.h + - crypto_encode_857x1723.h + - crypto_encode_857x1723round.h + - crypto_encode_857x3.h + - crypto_encode_857xint16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_core_multsntrup857.c + - crypto_decode_857x1723.c + - crypto_decode_857x3.c + - crypto_decode_857xint16.c + - crypto_decode_857xint32.c + - crypto_encode_857x1723.c + - crypto_encode_857x1723round.c + - crypto_encode_857x3.c + - crypto_encode_857xint16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: sntrup857 + implementation: avx2 + files: + - crypto_core_multsntrup857.h + - crypto_decode_857x1723.h + - crypto_decode_857x3.h + - crypto_decode_857xint16.h + - crypto_decode_857xint32.h + - crypto_encode_857x1723.h + - crypto_encode_857x1723round.h + - crypto_encode_857x3.h + - crypto_encode_857xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_857xint16.c + - crypto_decode_857xint32.c + - crypto_encode_857xint16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c +- source: + scheme: ntrulpr857 + implementation: avx2 + files: + - api.h + - crypto_core_multsntrup857.h + - crypto_decode_256x16.h + - crypto_decode_256x2.h + - crypto_decode_857x1723.h + - crypto_decode_857x3.h + - crypto_decode_857xint16.h + - crypto_decode_857xint32.h + - crypto_encode_256x16.h + - crypto_encode_256x2.h + - crypto_encode_857x1723.h + - crypto_encode_857x1723round.h + - crypto_encode_857x3.h + - crypto_encode_857xint16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_verify_1312.h + - crypto_decode_256x16.c + - crypto_decode_857xint16.c + - crypto_decode_857xint32.c + - crypto_encode_256x16.c + - crypto_encode_857xint16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c diff --git 
a/test/duplicate_consistency/sntrup653_avx2.yml b/test/duplicate_consistency/sntrup653_avx2.yml new file mode 100644 index 00000000..19793e9e --- /dev/null +++ b/test/duplicate_consistency/sntrup653_avx2.yml @@ -0,0 +1,86 @@ +consistency_checks: +- source: + scheme: sntrup653 + implementation: clean + files: + - api.h + - crypto_core_inv3sntrup653.h + - crypto_core_invsntrup653.h + - crypto_core_mult3sntrup653.h + - crypto_core_multsntrup653.h + - crypto_core_scale3sntrup653.h + - crypto_core_weightsntrup653.h + - crypto_decode_653x1541.h + - crypto_decode_653x3.h + - crypto_decode_653x4621.h + - crypto_decode_653xint16.h + - crypto_decode_653xint32.h + - crypto_encode_653x1541.h + - crypto_encode_653x1541round.h + - crypto_encode_653x3.h + - crypto_encode_653x4621.h + - crypto_encode_653xfreeze3.h + - crypto_encode_653xint16.h + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_verify_897.h + - crypto_decode_653xint16.c + - crypto_decode_653xint32.c + - crypto_encode_653xint16.c + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup761 + implementation: clean + files: + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup761 + implementation: avx2 + files: + - crypto_decode_int16.h + - crypto_encode_int16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_int16.c + - crypto_encode_int16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup857 + implementation: clean + files: + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup857 + implementation: avx2 + files: + - crypto_decode_int16.h + - crypto_encode_int16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_int16.c + - crypto_encode_int16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c diff --git a/test/duplicate_consistency/sntrup653_clean.yml b/test/duplicate_consistency/sntrup653_clean.yml new file mode 100644 index 00000000..817346c3 --- /dev/null +++ b/test/duplicate_consistency/sntrup653_clean.yml @@ -0,0 +1,82 @@ +consistency_checks: +- source: + scheme: sntrup653 + implementation: avx2 + files: + - api.h + - crypto_core_inv3sntrup653.h + - crypto_core_invsntrup653.h + - crypto_core_mult3sntrup653.h + - crypto_core_multsntrup653.h + - crypto_core_scale3sntrup653.h + - crypto_core_weightsntrup653.h + - crypto_decode_653x1541.h + - crypto_decode_653x3.h + - crypto_decode_653x4621.h + - crypto_decode_653xint16.h + - crypto_decode_653xint32.h + - crypto_encode_653x1541.h + - crypto_encode_653x1541round.h + - crypto_encode_653x3.h + - crypto_encode_653x4621.h + - crypto_encode_653xfreeze3.h + - crypto_encode_653xint16.h + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_verify_897.h + - crypto_decode_653xint16.c + - crypto_decode_653xint32.c + - crypto_encode_653xint16.c + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup761 + implementation: clean + files: + - crypto_encode_int16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - 
crypto_stream_aes256ctr.h + - crypto_encode_int16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup761 + implementation: avx2 + files: + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup857 + implementation: clean + files: + - crypto_encode_int16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_encode_int16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup857 + implementation: avx2 + files: + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c diff --git a/test/duplicate_consistency/sntrup761_avx2.yml b/test/duplicate_consistency/sntrup761_avx2.yml new file mode 100644 index 00000000..286e13bb --- /dev/null +++ b/test/duplicate_consistency/sntrup761_avx2.yml @@ -0,0 +1,60 @@ +consistency_checks: +- source: + scheme: sntrup761 + implementation: clean + files: + - api.h + - crypto_core_inv3sntrup761.h + - crypto_core_invsntrup761.h + - crypto_core_mult3sntrup761.h + - crypto_core_multsntrup761.h + - crypto_core_scale3sntrup761.h + - crypto_core_weightsntrup761.h + - crypto_decode_761x1531.h + - crypto_decode_761x3.h + - crypto_decode_761x4591.h + - crypto_decode_761xint16.h + - crypto_decode_761xint32.h + - crypto_encode_761x1531.h + - crypto_encode_761x1531round.h + - crypto_encode_761x3.h + - crypto_encode_761x4591.h + - crypto_encode_761xfreeze3.h + - crypto_encode_761xint16.h + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_verify_1039.h + - crypto_decode_761xint16.c + - crypto_decode_761xint32.c + - crypto_encode_761xint16.c + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup857 + implementation: clean + files: + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup857 + implementation: avx2 + files: + - crypto_decode_int16.h + - crypto_encode_int16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_decode_int16.c + - crypto_encode_int16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c diff --git a/test/duplicate_consistency/sntrup761_clean.yml b/test/duplicate_consistency/sntrup761_clean.yml new file mode 100644 index 00000000..28b0f63f --- /dev/null +++ b/test/duplicate_consistency/sntrup761_clean.yml @@ -0,0 +1,58 @@ +consistency_checks: +- source: + scheme: sntrup761 + implementation: avx2 + files: + - api.h + - crypto_core_inv3sntrup761.h + - crypto_core_invsntrup761.h + - crypto_core_mult3sntrup761.h + - crypto_core_multsntrup761.h + - crypto_core_scale3sntrup761.h + - crypto_core_weightsntrup761.h + - crypto_decode_761x1531.h + - crypto_decode_761x3.h + - crypto_decode_761x4591.h + - crypto_decode_761xint16.h + - crypto_decode_761xint32.h + - crypto_encode_761x1531.h + - crypto_encode_761x1531round.h + - crypto_encode_761x3.h + - crypto_encode_761x4591.h + - crypto_encode_761xfreeze3.h + - crypto_encode_761xint16.h + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - 
crypto_verify_1039.h + - crypto_decode_761xint16.c + - crypto_decode_761xint32.c + - crypto_encode_761xint16.c + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup857 + implementation: clean + files: + - crypto_encode_int16.h + - crypto_sort_int32.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_encode_int16.c + - crypto_sort_int32.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c +- source: + scheme: sntrup857 + implementation: avx2 + files: + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c diff --git a/test/duplicate_consistency/sntrup857_avx2.yml b/test/duplicate_consistency/sntrup857_avx2.yml new file mode 100644 index 00000000..9dd896f3 --- /dev/null +++ b/test/duplicate_consistency/sntrup857_avx2.yml @@ -0,0 +1,34 @@ +consistency_checks: +- source: + scheme: sntrup857 + implementation: clean + files: + - api.h + - crypto_core_inv3sntrup857.h + - crypto_core_invsntrup857.h + - crypto_core_mult3sntrup857.h + - crypto_core_multsntrup857.h + - crypto_core_scale3sntrup857.h + - crypto_core_weightsntrup857.h + - crypto_decode_857x1723.h + - crypto_decode_857x3.h + - crypto_decode_857x5167.h + - crypto_decode_857xint16.h + - crypto_decode_857xint32.h + - crypto_encode_857x1723.h + - crypto_encode_857x1723round.h + - crypto_encode_857x3.h + - crypto_encode_857x5167.h + - crypto_encode_857xfreeze3.h + - crypto_encode_857xint16.h + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_verify_1184.h + - crypto_decode_857xint16.c + - crypto_decode_857xint32.c + - crypto_encode_857xint16.c + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c diff --git a/test/duplicate_consistency/sntrup857_clean.yml b/test/duplicate_consistency/sntrup857_clean.yml new file mode 100644 index 00000000..1c5a45e6 --- /dev/null +++ b/test/duplicate_consistency/sntrup857_clean.yml @@ -0,0 +1,34 @@ +consistency_checks: +- source: + scheme: sntrup857 + implementation: avx2 + files: + - api.h + - crypto_core_inv3sntrup857.h + - crypto_core_invsntrup857.h + - crypto_core_mult3sntrup857.h + - crypto_core_multsntrup857.h + - crypto_core_scale3sntrup857.h + - crypto_core_weightsntrup857.h + - crypto_decode_857x1723.h + - crypto_decode_857x3.h + - crypto_decode_857x5167.h + - crypto_decode_857xint16.h + - crypto_decode_857xint32.h + - crypto_encode_857x1723.h + - crypto_encode_857x1723round.h + - crypto_encode_857x3.h + - crypto_encode_857x5167.h + - crypto_encode_857xfreeze3.h + - crypto_encode_857xint16.h + - crypto_encode_int16.h + - crypto_sort_uint32.h + - crypto_stream_aes256ctr.h + - crypto_verify_1184.h + - crypto_decode_857xint16.c + - crypto_decode_857xint32.c + - crypto_encode_857xint16.c + - crypto_encode_int16.c + - crypto_sort_uint32.c + - crypto_stream_aes256ctr.c + - kem.c
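
Note (not part of the patch): the decapsulation path in kem.c relies on two small constant-time idioms that recur across these files: the crypto_verify_* comparison, which folds all byte differences into a single mask without data-dependent branches, and the mask-driven conditional move used for implicit rejection (`r_enc[i+1] ^= mask & (r_enc[i+1] ^ rho[i])`). The sketch below is a standalone illustration of those idioms plus the F3_freeze rounding used by crypto_encode_857xfreeze3.c; the names verify_ct, cmov_bytes, and f3_freeze are hypothetical helpers for this example only, not PQClean APIs.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Constant-time comparison in the style of crypto_verify_1184:
   returns 0 if the buffers are equal, -1 otherwise, with no
   data-dependent branches. */
static int verify_ct(const unsigned char *x, const unsigned char *y, size_t n) {
    unsigned int differentbits = 0;
    size_t i;
    for (i = 0; i < n; ++i) {
        differentbits |= x[i] ^ y[i];
    }
    /* differentbits == 0  ->  (0-1)>>8 has low bit 1  ->  result 0
       differentbits != 0  ->  low bit 0               ->  result -1 */
    return (int) (1 & ((differentbits - 1) >> 8)) - 1;
}

/* Mask-driven conditional move, as used for implicit rejection in
   crypto_kem_dec: when mask == -1 (ciphertexts differ), r is replaced
   by rho; when mask == 0, r is left unchanged. */
static void cmov_bytes(unsigned char *r, const unsigned char *rho, size_t n, int mask) {
    size_t i;
    for (i = 0; i < n; ++i) {
        r[i] ^= (unsigned char) mask & (r[i] ^ rho[i]);
    }
}

/* Rounding to the balanced representative mod 3, as in F3_freeze:
   10923/2^15 approximates 1/3, so for -16384 <= x < 16384 the value
   x - 3*round(x/3) lies in {-1,0,1} and is congruent to x mod 3. */
static int8_t f3_freeze(int16_t x) {
    return (int8_t) (x - 3 * ((10923 * x + 16384) >> 15));
}

int main(void) {
    unsigned char a[4]      = {1, 2, 3, 4};
    unsigned char b[4]      = {1, 2, 9, 4};   /* differs from a in one byte */
    unsigned char secret[4] = {7, 7, 7, 7};
    unsigned char rho[4]    = {0, 0, 0, 0};

    int mask = verify_ct(a, b, 4);     /* -1: the buffers differ */
    cmov_bytes(secret, rho, 4, mask);  /* secret is overwritten by rho */
    printf("mask=%d secret[0]=%u\n", mask, secret[0]);

    /* 5167 = 3*1722 + 1, so the frozen value is 1; -1 maps to itself. */
    printf("f3_freeze(5167)=%d f3_freeze(-1)=%d\n", f3_freeze(5167), f3_freeze(-1));
    return 0;
}

The same mask value computed by verify_ct is also what kem.c feeds into the final domain separator (`x[0] = 1 + mask`), so a mismatched ciphertext silently switches the hash input to the stored random string rho instead of branching on secret data.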